/// Framework Core Low-Level Unicode UTF-8 UTF-16 Ansi Conversion // - this unit is a part of the Open Source Synopse mORMot framework 2, // licensed under a MPL/GPL/LGPL three license - see LICENSE.md unit mormot.core.unicode; { ***************************************************************************** Efficient Unicode Conversion Classes shared by all framework units - UTF-8 Efficient Encoding / Decoding - UTF-8 / UTF-16 / Ansi Conversion Classes - Text File Loading with BOM/Unicode Support - Low-Level String Conversion Functions - Text Case-(in)sensitive Conversion and Comparison - UTF-8 String Manipulation Functions - TRawUtf8DynArray Processing Functions - Operating-System Independent Unicode Process ***************************************************************************** } interface {$I mormot.defines.inc} uses classes, sysutils, mormot.core.base, mormot.core.os; { *************** UTF-8 Efficient Encoding / Decoding } // some constants used for UTF-8 conversion, including surrogates type // see http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks {$ifdef USERECORDWITHMETHODS} TUtf8Table = record {$else} TUtf8Table = object {$endif USERECORDWITHMETHODS} public Lookup: array[byte] of byte; Extra: array[0..6] of record offset, minimum: cardinal; end; FirstByte: array[2..6] of byte; /// retrieve a >127 UCS4 CodePoint from UTF-8 function GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint; end; PUtf8Table = ^TUtf8Table; const UTF8_ASCII = 0; UTF8_INVALID = 6; UTF8_ZERO = 7; UTF8_TABLE: TUtf8Table = ( Lookup: ( 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6); Extra: ( (offset: $00000000; minimum: $00010000), (offset: $00003080; minimum: $00000080), (offset: $000e2080; minimum: $00000800), (offset: $03c82080; minimum: $00010000), (offset: $fa082080; minimum: $00200000), (offset: $82082080; minimum: $04000000), (offset: $00000000; minimum: $04000000)); FirstByte: ( $c0, $e0, $f0, $f8, $fc)); UTF8_EXTRA_SURROGATE = 3; UTF16_HISURROGATE_MIN = $d800; UTF16_HISURROGATE_MAX = $dbff; UTF16_LOSURROGATE_MIN = $dc00; UTF16_LOSURROGATE_MAX = $dfff; UTF16_SURROGATE_OFFSET = $d7c0; /// replace any incoming character whose value is unrepresentable in Unicode // - set e.g. by GetUtf8WideChar(), Utf8UpperReference() or // RawUnicodeToUtf8() when ccfReplacementCharacterForUnmatchedSurrogate is set // - encoded as $ef $bf $bd bytes in UTF-8 UNICODE_REPLACEMENT_CHARACTER = $fffd; /// internal function, used to retrieve a >127 US4 CodePoint from UTF-8 // - not to be called directly, but from inlined higher-level functions // - here U^ shall be always >= #80 // - typical use is as such: // ! ch := ord(P^); // ! if ch and $80=0 then // ! inc(P) else // ! ch := GetHighUtf8Ucs4(P); function GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint; {$ifdef HASINLINE}inline;{$endif} /// decode UTF-16 WideChar from UTF-8 input buffer // - any surrogate (Ucs4>$ffff) is returned as UNICODE_REPLACEMENT_CHARACTER=$fffd function GetUtf8WideChar(P: PUtf8Char): cardinal; {$ifdef HASINLINE}inline;{$endif} /// get the UCS4 CodePoint stored in P^ (decode UTF-8 if necessary) function NextUtf8Ucs4(var P: PUtf8Char): Ucs4CodePoint; {$ifdef HASINLINE}inline;{$endif} /// UTF-8 encode one UTF-16 encoded UCS4 CodePoint into Dest // - return the number of bytes written into Dest (i.e. from 1 up to 6) // - Source will contain the next UTF-16 character // - this method DOES properly handle UTF-16 surrogate pairs function Utf16CharToUtf8(Dest: PUtf8Char; var Source: PWord): integer; /// UTF-8 encode one UCS4 CodePoint into Dest // - return the number of bytes written into Dest (i.e. from 1 up to 6) // - this method DOES properly handle UTF-16 surrogate pairs function Ucs4ToUtf8(ucs4: Ucs4CodePoint; Dest: PUtf8Char): PtrInt; {$ifdef HASINLINE}inline;{$endif} type /// option set for RawUnicodeToUtf8() conversion TCharConversionFlags = set of ( ccfNoTrailingZero, ccfReplacementCharacterForUnmatchedSurrogate); /// convert a UTF-16 PWideChar buffer into a UTF-8 string procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; var result: RawUtf8; Flags: TCharConversionFlags = [ccfNoTrailingZero]); overload; /// convert a UTF-16 PWideChar buffer into a UTF-8 temporary buffer procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; var result: TSynTempBuffer; Flags: TCharConversionFlags); overload; /// convert a UTF-16 PWideChar buffer into a UTF-8 string function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; Flags: TCharConversionFlags = [ccfNoTrailingZero]): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// convert a UTF-16 PWideChar buffer into a UTF-8 buffer // - replace system.UnicodeToUtf8 implementation, which is rather slow // since Delphi 2009+ // - append a trailing #0 to the ending PUtf8Char, unless ccfNoTrailingZero is set // - if ccfReplacementCharacterForUnmatchedSurrogate is set, this function will identify // unmatched surrogate pairs and replace them with UNICODE_REPLACEMENT_CHARACTER - // see https://en.wikipedia.org/wiki/Specials_(Unicode_block) function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar; SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt; overload; /// convert a UTF-16 PWideChar buffer into a UTF-8 string // - this version doesn't resize the resulting RawUtf8 string, but return // the new resulting RawUtf8 byte count into Utf8Length function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; out Utf8Length: integer): RawUtf8; overload; /// convert an UTF-8 encoded text into a WideChar (UTF-16) buffer // - faster than System.Utf8ToUnicode // - sourceBytes can by 0, therefore length is computed from zero terminated source // - enough place must be available in dest buffer (guess is sourceBytes*3+2) // - a WideChar(#0) is added at the end (if something is written) unless // NoTrailingZero is TRUE // - returns the BYTE count written in dest, excluding the ending WideChar(#0) function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; sourceBytes: PtrInt = 0; NoTrailingZero: boolean = false): PtrInt; overload; /// convert an UTF-8 encoded text into a WideChar (UTF-16) buffer // - faster than System.Utf8ToUnicode // - this overloaded function expect a MaxDestChars parameter // - sourceBytes can not be 0 for this function // - enough place must be available in dest buffer (guess is sourceBytes*3+2) // - a WideChar(#0) is added at the end (if something is written) unless // NoTrailingZero is TRUE // - returns the BYTE COUNT (not WideChar count) written in dest, excluding the // ending WideChar(#0) function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean = false): PtrInt; overload; /// direct conversion of a UTF-8 encoded buffer into a WinAnsi ShortString buffer // - non WinAnsi chars are replaced by '?' placeholders procedure Utf8ToShortString(var dest: ShortString; source: PUtf8Char); /// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^ // - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates // - faster than System.Utf8ToUnicode with dest=nil function Utf8ToUnicodeLength(source: PUtf8Char): PtrUInt; /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will also refuse #0 characters within the buffer // - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM var IsValidUtf8Buffer: function(source: PUtf8Char; sourcelen: PtrInt): boolean; /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will also refuse #0 characters within the buffer // - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM, reaching e.g. // 21 GB/s parsing speed on a Core i5-13500 function IsValidUtf8(const source: RawUtf8): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// returns TRUE if the supplied buffer has valid UTF-8 encoding // - will stop when the buffer contains #0 // - just a wrapper around IsValidUtf8Buffer(source, StrLen(source)) so if you // know the source length, you would better call IsValidUtf8Buffer() directly // - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM, reaching e.g. // 15 GB/s parsing speed on a Core i5-13500 - StrLen() itself runs at 37 GB/s function IsValidUtf8(source: PUtf8Char): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// detect UTF-8 content and mark the variable with the CP_UTF8 codepage // - to circumvent FPC concatenation bug with CP_UTF8 and CP_RAWBYTESTRING procedure DetectRawUtf8(var source: RawByteString); {$ifndef HASCODEPAGE}{$ifdef HASINLINE}inline;{$endif}{$endif} /// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #1..#31 // control characters // - supplied input is a pointer to a #0 ended text buffer function IsValidUtf8WithoutControlChars(source: PUtf8Char): boolean; overload; /// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #0..#31 // control characters // - supplied input is a RawUtf8 variable function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean; overload; /// will truncate the supplied UTF-8 value if its length exceeds the specified // UTF-16 Unicode characters count // - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates // - returns FALSE if text was not truncated, TRUE otherwise function Utf8TruncateToUnicodeLength(var text: RawUtf8; maxUtf16: integer): boolean; /// will truncate the supplied UTF-8 value if its length exceeds the specified // bytes count // - this function will ensure that the returned content will contain only valid // UTF-8 sequence, i.e. will trim the whole trailing UTF-8 sequence // - returns FALSE if text was not truncated, TRUE otherwise function Utf8TruncateToLength(var text: RawUtf8; maxBytes: PtrUInt): boolean; /// compute the truncated length of the supplied UTF-8 value if it exceeds the // specified bytes count // - this function will ensure that the returned content will contain only valid // UTF-8 sequence, i.e. will trim the whole trailing UTF-8 sequence // - returns maxBytes if text was not truncated, or the number of fitting bytes function Utf8TruncatedLength(const text: RawUtf8; maxBytes: PtrUInt): PtrInt; overload; /// compute the truncated length of the supplied UTF-8 value if it exceeds the // specified bytes count // - this function will ensure that the returned content will contain only valid // UTF-8 sequence, i.e. will trim the whole trailing UTF-8 sequence // - returns maxBytes if text was not truncated, or the number of fitting bytes function Utf8TruncatedLength(text: PAnsiChar; textlen, maxBytes: PtrUInt): PtrInt; overload; /// calculate the UTF-16 Unicode characters count of the UTF-8 encoded first line // - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates // - end the parsing at first #13 or #10 character function Utf8FirstLineToUtf16Length(source: PUtf8Char): PtrInt; { **************** UTF-8 / UTF-16 / Ansi Conversion Classes } type /// Exception raised by this unit in case of fatal conversion issue ESynUnicode = class(ExceptionWithProps); /// an abstract class to handle Ansi to/from Unicode translation // - implementations of this class will handle efficiently all Code Pages // - this default implementation will use the Operating System APIs // - you should not create your own class instance by yourself, but should // better retrieve an instance using TSynAnsiConvert.Engine(), which will // initialize either a TSynAnsiFixedWidth or a TSynAnsiConvert instance on need TSynAnsiConvert = class protected fCodePage: cardinal; fAnsiCharShift: byte; public /// initialize the internal conversion engine constructor Create(aCodePage: cardinal); reintroduce; virtual; /// returns the engine corresponding to a given code page // - a global list of TSynAnsiConvert instances is handled by the unit - // therefore, caller should not release the returned instance // - will return nil in case of unhandled code page // - is aCodePage is 0, will return CurrentAnsiConvert value class function Engine(aCodePage: cardinal): TSynAnsiConvert; /// direct conversion of a PAnsiChar buffer into an Unicode buffer // - Dest^ buffer must be reserved with at least SourceChars*2 bytes // - this default implementation will use the Operating System APIs // - will append a trailing #0 to the returned PWideChar, unless // NoTrailingZero is set function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; overload; virtual; /// direct conversion of a PAnsiChar buffer into a UTF-8 encoded buffer // - Dest^ buffer must be reserved with at least SourceChars*3 bytes // - will append a trailing #0 to the returned PUtf8Char, unless // NoTrailingZero is set // - this default implementation will use the Operating System APIs function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; overload; virtual; {$ifndef PUREMORMOT2} /// convert any Ansi Text into an UTF-16 Unicode String // - returns a value using our RawUnicode kind of string function AnsiToRawUnicode(const AnsiText: RawByteString): RawUnicode; overload; /// convert any Ansi buffer into an Unicode String // - returns a value using our RawUnicode kind of string function AnsiToRawUnicode( Source: PAnsiChar; SourceChars: cardinal): RawUnicode; overload; virtual; {$endif PUREMORMOT2} /// convert any Ansi buffer into an Unicode String // - returns a SynUnicode, i.e. Delphi 2009+ UnicodeString or a WideString function AnsiToUnicodeString( Source: PAnsiChar; SourceChars: cardinal): SynUnicode; overload; /// convert any Ansi buffer into an Unicode String // - returns a SynUnicode, i.e. Delphi 2009+ UnicodeString or a WideString function AnsiToUnicodeString(const Source: RawByteString): SynUnicode; overload; /// convert any Ansi Text into an UTF-8 encoded String // - internally calls AnsiBufferToUtf8 virtual method function AnsiToUtf8(const AnsiText: RawByteString): RawUtf8; virtual; /// direct conversion of a PAnsiChar buffer into a UTF-8 encoded string // - will call AnsiBufferToUnicode() overloaded virtual method procedure AnsiBufferToRawUtf8(Source: PAnsiChar; SourceChars: cardinal; out Value: RawUtf8); overload; virtual; /// direct conversion of an Unicode buffer into a PAnsiChar buffer // - Dest^ buffer must be reserved with at least SourceChars * 3 bytes // - will detect and ignore any trailing UTF-16LE BOM marker // - this default implementation will rely on the Operating System for // all non ASCII-7 chars function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; overload; virtual; /// direct conversion of an Unicode buffer into an Ansi Text function UnicodeBufferToAnsi(Source: PWideChar; SourceChars: cardinal): RawByteString; overload; virtual; /// convert any Unicode-encoded String into Ansi Text // - internally calls UnicodeBufferToAnsi virtual method function UnicodeStringToAnsi(const Source: SynUnicode): RawByteString; {$ifdef HASINLINE}inline;{$endif} {$ifndef PUREMORMOT2} /// convert any Unicode-encoded String into Ansi Text // - internally calls UnicodeBufferToAnsi virtual method function RawUnicodeToAnsi(const Source: RawUnicode): RawByteString; {$endif PUREMORMOT2} /// direct conversion of an UTF-8 encoded buffer into a PAnsiChar buffer // - Dest^ buffer must be reserved with at least SourceChars bytes // - no trailing #0 is appended to the buffer function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; overload; virtual; /// convert any UTF-8 encoded buffer into Ansi Text // - internally calls Utf8BufferToAnsi virtual method function Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal): RawByteString; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded buffer into Ansi Text // - internally calls Utf8BufferToAnsi virtual method procedure Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal; var result: RawByteString); overload; virtual; /// convert any UTF-8 encoded String into Ansi Text // - internally calls Utf8BufferToAnsi virtual method function Utf8ToAnsi(const u: RawUtf8): RawByteString; virtual; /// direct conversion of a UTF-8 encoded string into a WinAnsi <2KB buffer // - will truncate the destination string to DestSize bytes (including the // trailing #0), with a maximum handled size of 2048 bytes // - returns the number of bytes stored in Dest^ (i.e. the position of #0) function Utf8ToAnsiBuffer2K(const S: RawUtf8; Dest: PAnsiChar; DestSize: integer): integer; /// convert any Ansi Text (providing a From converted) into Ansi Text function AnsiToAnsi(From: TSynAnsiConvert; const Source: RawByteString): RawByteString; overload; /// convert any Ansi buffer (providing a From converted) into Ansi Text function AnsiToAnsi(From: TSynAnsiConvert; Source: PAnsiChar; SourceChars: cardinal): RawByteString; overload; /// corresponding code page property CodePage: cardinal read fCodePage; /// corresponding length binary shift used for worst conversion case property AnsiCharShift: byte read fAnsiCharShift; end; /// a class to handle Ansi to/from Unicode translation of fixed width encoding // (i.e. non MBCS) // - this class will handle efficiently all Code Page availables without MBCS // encoding - like WinAnsi (1252) or Russian (1251) // - it will use internal fast look-up tables for such encodings // - this class could take some time to generate, and will consume more than // 64 KB of memory: you should not create your own class instance by yourself, // but should better retrieve an instance using TSynAnsiConvert.Engine(), which // will initialize either a TSynAnsiFixedWidth or a TSynAnsiConvert instance // on need // - this class has some additional methods (e.g. IsValid*) which take // advantage of the internal lookup tables to provide some fast process TSynAnsiFixedWidth = class(TSynAnsiConvert) protected fAnsiToWide: TWordDynArray; fWideToAnsi: TByteDynArray; public /// initialize the internal conversion engine constructor Create(aCodePage: cardinal); override; /// direct conversion of a PAnsiChar buffer into an Unicode buffer // - Dest^ buffer must be reserved with at least SourceChars*2 bytes // - will append a trailing #0 to the returned PWideChar, unless // NoTrailingZero is set function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; override; /// direct conversion of a PAnsiChar buffer into a UTF-8 encoded buffer // - Dest^ buffer must be reserved with at least SourceChars*3 bytes // - will append a trailing #0 to the returned PUtf8Char, unless // NoTrailingZero is set function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; override; {$ifndef PUREMORMOT2} /// convert any Ansi buffer into an Unicode String // - returns a value using our RawUnicode kind of string function AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; override; {$endif PUREMORMOT2} /// direct conversion of an Unicode buffer into a PAnsiChar buffer // - Dest^ buffer must be reserved with at least SourceChars * 3 bytes // - will detect and ignore any trailing UTF-16LE BOM marker // - this overridden version will use internal lookup tables for fast process function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; override; /// direct conversion of an UTF-8 encoded buffer into a PAnsiChar buffer // - Dest^ buffer must be reserved with at least SourceChars bytes // - no trailing #0 is appended to the buffer // - non Ansi compatible characters are replaced as '?' function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; override; /// conversion of a wide char into the corresponding Ansi character // - return -1 for an unknown WideChar in the current code page function WideCharToAnsiChar(wc: cardinal): integer; /// return TRUE if the supplied unicode buffer only contains characters of // the corresponding Ansi code page // - i.e. if the text can be displayed using this code page function IsValidAnsi(WideText: PWideChar; Length: PtrInt): boolean; overload; /// return TRUE if the supplied unicode buffer only contains characters of // the corresponding Ansi code page // - i.e. if the text can be displayed using this code page function IsValidAnsi(WideText: PWideChar): boolean; overload; /// return TRUE if the supplied UTF-8 buffer only contains characters of // the corresponding Ansi code page // - i.e. if the text can be displayed using this code page function IsValidAnsiU(Utf8Text: PUtf8Char): boolean; /// return TRUE if the supplied UTF-8 buffer only contains 8-bit characters // of the corresponding Ansi code page // - i.e. if the text can be displayed with only 8-bit unicode characters // (e.g. no "tm" or such) within this code page function IsValidAnsiU8Bit(Utf8Text: PUtf8Char): boolean; /// direct access to the Ansi-To-Unicode lookup table // - use this array like AnsiToWide: array[byte] of word property AnsiToWide: TWordDynArray read fAnsiToWide; /// direct access to the Unicode-To-Ansi lookup table // - use this array like WideToAnsi: array[word] of byte // - any unhandled WideChar will return ord('?') property WideToAnsi: TByteDynArray read fWideToAnsi; end; /// a class to handle UTF-8 to/from Unicode translation // - match the TSynAnsiConvert signature, for code page CP_UTF8 // - this class is mostly a non-operation for conversion to/from UTF-8 TSynAnsiUtf8 = class(TSynAnsiConvert) public /// initialize the internal conversion engine constructor Create(aCodePage: cardinal); override; /// direct conversion of a PAnsiChar UTF-8 buffer into an Unicode buffer // - Dest^ buffer must be reserved with at least SourceChars*2 bytes // - will append a trailing #0 to the returned PWideChar, unless // NoTrailingZero is set function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; override; /// direct conversion of a PAnsiChar UTF-8 buffer into a UTF-8 encoded buffer // - Dest^ buffer must be reserved with at least SourceChars*3 bytes // - will append a trailing #0 to the returned PUtf8Char, unless // NoTrailingZero is set function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; override; {$ifndef PUREMORMOT2} /// convert any UTF-8 Ansi buffer into an Unicode String // - returns a value using our RawUnicode kind of string function AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; override; {$endif PUREMORMOT2} /// direct conversion of an Unicode buffer into a PAnsiChar UTF-8 buffer // - will detect and ignore any trailing UTF-16LE BOM marker // - Dest^ buffer must be reserved with at least SourceChars * 3 bytes function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; override; /// direct conversion of an Unicode buffer into an Ansi Text function UnicodeBufferToAnsi(Source: PWideChar; SourceChars: cardinal): RawByteString; override; /// direct conversion of an UTF-8 encoded buffer into a PAnsiChar UTF-8 buffer // - Dest^ buffer must be reserved with at least SourceChars bytes // - no trailing #0 is appended to the buffer function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; override; /// convert any UTF-8 encoded buffer into Ansi Text procedure Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal; var result: RawByteString); override; /// convert any UTF-8 encoded String into Ansi Text // - directly assign the input as result, since no conversion is needed function Utf8ToAnsi(const u: RawUtf8): RawByteString; override; /// convert any Ansi Text into an UTF-8 encoded String // - directly assign the input as result, since no conversion is needed function AnsiToUtf8(const AnsiText: RawByteString): RawUtf8; override; /// direct conversion of a PAnsiChar buffer into a UTF-8 encoded string procedure AnsiBufferToRawUtf8(Source: PAnsiChar; SourceChars: cardinal; out Value: RawUtf8); override; end; /// a class to handle UTF-16 to/from Unicode translation // - match the TSynAnsiConvert signature, for code page CP_UTF16 // - even if UTF-16 is not an Ansi format, code page CP_UTF16 may have been // used to store UTF-16 encoded binary content // - this class is mostly a non-operation for conversion to/from Unicode TSynAnsiUtf16 = class(TSynAnsiConvert) public /// initialize the internal conversion engine constructor Create(aCodePage: cardinal); override; /// direct conversion of a PAnsiChar UTF-16 buffer into an Unicode buffer // - Dest^ buffer must be reserved with at least SourceChars*2 bytes // - will append a trailing #0 to the returned PWideChar, unless // NoTrailingZero is set function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; override; /// direct conversion of a PAnsiChar UTF-16 buffer into a UTF-8 encoded buffer // - Dest^ buffer must be reserved with at least SourceChars*3 bytes // - will append a trailing #0 to the returned PUtf8Char, unless // NoTrailingZero is set function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; override; {$ifndef PUREMORMOT2} /// convert any UTF-16 Ansi buffer into an Unicode String // - returns a value using our RawUnicode kind of string function AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; override; {$endif PUREMORMOT2} /// direct conversion of an Unicode buffer into a PAnsiChar UTF-16 buffer // - Dest^ buffer must be reserved with at least SourceChars * 3 bytes function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; override; /// direct conversion of an UTF-8 encoded buffer into a PAnsiChar UTF-16 buffer // - Dest^ buffer must be reserved with at least SourceChars bytes // - no trailing #0 is appended to the buffer function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; override; end; var /// global TSynAnsiConvert instance to handle WinAnsi encoding (code page 1252) // - this instance is global and instantied during the whole program life time // - it will be created from hard-coded values, and not using the system API, // since it appeared that some systems (e.g. in Russia) did tweak the registry // so that 1252 code page maps 1251 code page WinAnsiConvert: TSynAnsiFixedWidth; /// global TSynAnsiConvert instance to handle current system encoding // - this is the encoding as used by the AnsiString type, so will be used // before Delphi 2009 to speed-up RTL string handling (especially for UTF-8) // - this instance is global and instantied during the whole program life time CurrentAnsiConvert: TSynAnsiConvert; /// global TSynAnsiConvert instance to handle UTF-8 encoding (code page CP_UTF8) // - this instance is global and instantied during the whole program life time Utf8AnsiConvert: TSynAnsiUtf8; /// global TSynAnsiConvert instance with no encoding (RawByteString/RawBlob) RawByteStringConvert: TSynAnsiFixedWidth; /// check if a code page is known to be of fixed width, i.e. not MBCS // - i.e. will be implemented as a TSynAnsiFixedWidth function IsFixedWidthCodePage(aCodePage: cardinal): boolean; /// return a code page number into human-friendly text function CodePageToText(aCodePage: cardinal): TShort16; { *************** Text File Loading with BOM/Unicode Support } type /// text file layout, as returned by BomFile() and StringFromBomFile() // - bomNone means there was no BOM recognized // - bomUnicode stands for UTF-16 LE encoding (as on Windows products) // - bomUtf8 stands for a UTF-8 BOM (as on Windows products) TBomFile = ( bomNone, bomUnicode, bomUtf8); const /// UTF-16LE BOM WideChar marker, as existing e.g. in some UTF-16 Windows files BOM_UTF16LE = #$FEFF; /// check the file BOM at the beginning of a file buffer // - BOM is common only with Microsoft products // - returns bomNone if no BOM was recognized // - returns bomUnicode or bomUtf8 if UTF-16LE or UTF-8 BOM were recognized: // and will adjust Buffer/BufferSize to ignore the leading 2 or 3 bytes function BomFile(var Buffer: pointer; var BufferSize: PtrInt): TBomFile; /// read a file into a temporary variable, check the BOM, and adjust the buffer function StringFromBomFile(const FileName: TFileName; out FileContent: RawByteString; out Buffer: pointer; out BufferSize: PtrInt): TBomFile; /// read a File content into a RawUtf8, detecting any leading BOM // - will assume text file with no BOM is already UTF-8 encoded // - an alternative to StringFromFile() if you want to handle UTF-8 content // and the files are likely to be natively UTF-8 encoded, or with a BOM function RawUtf8FromFile(const FileName: TFileName): RawUtf8; {$ifdef HASINLINE} inline; {$endif} /// read a File content into a RawUtf8, detecting any leading BOM // - assume file with no BOM is encoded with the current Ansi code page, not // UTF-8, unless AssumeUtf8IfNoBom is true and it behaves like RawUtf8FromFile() function AnyTextFileToRawUtf8(const FileName: TFileName; AssumeUtf8IfNoBom: boolean = false): RawUtf8; /// read a File content into a RTL string, detecting any leading BOM // - assume file with no BOM is encoded with the current Ansi code page, not UTF-8 // - if ForceUtf8 is true, won't detect the BOM but assume whole file is UTF-8 function AnyTextFileToString(const FileName: TFileName; ForceUtf8: boolean = false): string; /// read a File content into SynUnicode string, detecting any leading BOM // - assume file with no BOM is encoded with the current Ansi code page, not UTF-8 // - if ForceUtf8 is true, won't detect the BOM but assume whole file is UTF-8 function AnyTextFileToSynUnicode(const FileName: TFileName; ForceUtf8: boolean = false): SynUnicode; { *************** Low-Level String Conversion Functions } /// will fast replace all #0 chars as ~ // - could be used after UniqueRawUtf8() on a in-placed modified JSON buffer, // in which all values have been ended with #0 // - you can optionally specify a maximum size, in bytes (this won't reallocate // the string, but just add a #0 at some point in the UTF-8 buffer) // - could allow logging of parsed input e.g. after an exception procedure UniqueRawUtf8ZeroToTilde(var u: RawUtf8; MaxSize: PtrInt = maxInt); /// convert a binary buffer into a fake ASCII/UTF-8 content without any #0 input // - will use ~ char to escape any #0 as ~0 pair (and plain ~ as ~~ pair) // - output is just a bunch of non 0 bytes, so not trully valid UTF-8 content // - may be used as an alternative to Base64 encoding if 8-bit chars are allowed // - call ZeroedRawUtf8() as reverse function function UnZeroed(const bin: RawByteString): RawUtf8; /// convert a fake UTF-8 buffer without any #0 input back into its original binary // - may be used as an alternative to Base64 decoding if 8-bit chars are allowed // - call UnZeroedRawUtf8() as reverse function function Zeroed(const u: RawUtf8): RawByteString; /// conversion of a wide char into a WinAnsi (CodePage 1252) char // - return '?' for an unknown WideChar in code page 1252 function WideCharToWinAnsiChar(wc: cardinal): AnsiChar; {$ifdef HASINLINE}inline;{$endif} /// conversion of a wide char into a WinAnsi (CodePage 1252) char index // - return -1 for an unknown WideChar in code page 1252 function WideCharToWinAnsi(wc: cardinal): integer; {$ifdef HASINLINE}inline;{$endif} /// return TRUE if the supplied unicode buffer only contains WinAnsi characters // - i.e. if the text can be displayed using ANSI_CHARSET function IsWinAnsi(WideText: PWideChar): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// return TRUE if the supplied unicode buffer only contains WinAnsi characters // - i.e. if the text can be displayed using ANSI_CHARSET function IsWinAnsi(WideText: PWideChar; Length: integer): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// return TRUE if the supplied UTF-8 buffer only contains WinAnsi characters // - i.e. if the text can be displayed using ANSI_CHARSET function IsWinAnsiU(Utf8Text: PUtf8Char): boolean; {$ifdef HASINLINE}inline;{$endif} /// return TRUE if the supplied UTF-8 buffer only contains WinAnsi 8-bit characters // - i.e. if the text can be displayed using ANSI_CHARSET with only 8-bit unicode // characters (e.g. no "tm" or such) function IsWinAnsiU8Bit(Utf8Text: PUtf8Char): boolean; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of an AnsiString with an unknown code page into an // UTF-8 encoded String // - will assume CurrentAnsiConvert.CodePage prior to Delphi 2009 // - newer UNICODE versions of Delphi will retrieve the code page from string procedure AnyAnsiToUtf8(const s: RawByteString; var result: RawUtf8); overload; /// direct conversion of an AnsiString with an unknown code page into an // UTF-8 encoded String // - will assume CurrentAnsiConvert.CodePage prior to Delphi 2009 // - newer UNICODE versions of Delphi will retrieve the code page from string // - use AnsiToUtf8() if you want to specify the codepage function AnyAnsiToUtf8(const s: RawByteString): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// convert an AnsiString (of a given code page) into a UTF-8 string // - use AnyAnsiToUtf8() if you want to use the codepage of the input string // - wrapper around TSynAnsiConvert.Engine(CodePage).AnsiToUtf8() function AnsiToUtf8(const Ansi: RawByteString; CodePage: integer): RawUtf8; {$ifdef HASINLINE}inline;{$endif} /// convert an AnsiChar buffer (of a given code page) into a UTF-8 string // - the destination code page should be supplied // - wrapper around TSynAnsiConvert.Engine(CodePage).AnsiBufferToRawUtf8() procedure AnsiCharToUtf8(P: PAnsiChar; L: integer; var result: RawUtf8; CodePage: integer); {$ifdef HASINLINE}inline;{$endif} /// convert an AnsiString (of a given code page) into a RTL string // - the destination code page should be supplied // - wrapper around TSynAnsiConvert.Engine(CodePage) and string conversion function AnsiToString(const Ansi: RawByteString; CodePage: integer): string; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a WinAnsi (CodePage 1252) string into a UTF-8 encoded String // - faster than SysUtils: don't use Utf8Encode(WideString) -> no Windows.Global(), // and use a fixed pre-calculated array for individual chars conversion function WinAnsiToUtf8(const S: WinAnsiString): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a WinAnsi (CodePage 1252) string into a UTF-8 encoded String // - faster than SysUtils: don't use Utf8Encode(WideString) -> no Windows.Global(), // and use a fixed pre-calculated array for individual chars conversion function WinAnsiToUtf8(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a WinAnsi PAnsiChar buffer into a UTF-8 encoded buffer // - Dest^ buffer must be reserved with at least SourceChars*3 // - call internally WinAnsiConvert fast conversion class function WinAnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal): PUtf8Char; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a WinAnsi ShortString into a UTF-8 text // - call internally WinAnsiConvert fast conversion class function ShortStringToUtf8(const source: ShortString): RawUtf8; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a WinAnsi (CodePage 1252) string into a Unicode buffer // - very fast, by using a fixed pre-calculated array for individual chars conversion // - text will be truncated if necessary to avoid buffer overflow in Dest[] procedure WinAnsiToUnicodeBuffer(const S: WinAnsiString; Dest: PWordArray; DestLen: PtrInt); {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a UTF-8 encoded string into a WinAnsi String function Utf8ToWinAnsi(const S: RawUtf8): WinAnsiString; overload; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a UTF-8 encoded zero terminated buffer into a WinAnsi String function Utf8ToWinAnsi(P: PUtf8Char): WinAnsiString; overload; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a UTF-8 encoded zero terminated buffer into a RawUtf8 String procedure Utf8ToRawUtf8(P: PUtf8Char; var result: RawUtf8); {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a UTF-8 encoded buffer into a WinAnsi PAnsiChar buffer function Utf8ToWinPChar(dest: PAnsiChar; source: PUtf8Char; count: integer): integer; {$ifdef HASINLINE}inline;{$endif} {$ifndef PUREMORMOT2} /// direct conversion of a WinAnsi (CodePage 1252) string into a Unicode encoded String // - very fast, by using a fixed pre-calculated array for individual chars conversion function WinAnsiToRawUnicode(const S: WinAnsiString): RawUnicode; /// convert a UTF-16 string into a WinAnsi (code page 1252) string function RawUnicodeToWinAnsi(const Unicode: RawUnicode): WinAnsiString; overload; {$ifdef HASINLINE}inline;{$endif} /// convert a UTF-8 encoded buffer into a RawUnicode string // - if L is 0, L is computed from zero terminated P buffer // - RawUnicode is ended by a WideChar(#0) // - faster than System.Utf8Decode() which uses slow widestrings function Utf8DecodeToRawUnicode(P: PUtf8Char; L: integer): RawUnicode; overload; /// convert a UTF-8 string into a RawUnicode string function Utf8DecodeToRawUnicode(const S: RawUtf8): RawUnicode; overload; {$ifdef HASINLINE}inline;{$endif} /// convert a UTF-8 string into a RawUnicode string // - this version doesn't resize the length of the result RawUnicode // and is therefore useful before a Win32 Unicode API call (with nCount=-1) // - if DestLen is not nil, the resulting length (in bytes) will be stored within // - see also Utf8DecodeToUnicode() which uses a TSynTempBuffer for storage function Utf8DecodeToRawUnicodeUI(const S: RawUtf8; DestLen: PInteger = nil): RawUnicode; overload; /// convert a UTF-8 string into a RawUnicode string // - returns the resulting length (in bytes) will be stored within Dest // - see also Utf8DecodeToUnicode() which uses a TSynTempBuffer for storage function Utf8DecodeToRawUnicodeUI(const S: RawUtf8; var Dest: RawUnicode): integer; overload; /// convert a RawUnicode string into a UTF-8 string function RawUnicodeToUtf8(const Unicode: RawUnicode): RawUtf8; overload; /// convert any RawUnicode String into a generic SynUnicode Text function RawUnicodeToSynUnicode(const Unicode: RawUnicode): SynUnicode; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string into a RawUnicode encoded String // - it's prefered to use TLanguageFile.StringToUtf8() method in mORMoti18n, // which will handle full i18n of your application // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function StringToRawUnicode(const S: string): RawUnicode; overload; /// convert any RTL string into a RawUnicode encoded String // - it's prefered to use TLanguageFile.StringToUtf8() method in mORMoti18n, // which will handle full i18n of your application // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function StringToRawUnicode(P: PChar; L: integer): RawUnicode; overload; /// convert any RawUnicode encoded string into a RTL string // - uses StrLenW() and not length(U) to handle case when was used as buffer function RawUnicodeToString(const U: RawUnicode): string; overload; {$endif PUREMORMOT2} /// convert a SynUnicode string into a UTF-8 string function SynUnicodeToUtf8(const Unicode: SynUnicode): RawUtf8; /// convert a WideString into a UTF-8 string function WideStringToUtf8(const aText: WideString): RawUtf8; {$ifdef HASINLINE}inline;{$endif} /// direct conversion of a UTF-16 encoded buffer into a WinAnsi PAnsiChar buffer procedure RawUnicodeToWinPChar(dest: PAnsiChar; source: PWideChar; WideCharCount: integer); {$ifdef HASINLINE}inline;{$endif} /// convert a UTF-16 PWideChar buffer into a WinAnsi (code page 1252) string function RawUnicodeToWinAnsi( WideChar: PWideChar; WideCharCount: integer): WinAnsiString; overload; {$ifdef HASINLINE}inline;{$endif} /// convert a WideString into a WinAnsi (code page 1252) string function WideStringToWinAnsi(const Wide: WideString): WinAnsiString; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-16 buffer into a generic SynUnicode Text function RawUnicodeToSynUnicode( WideChar: PWideChar; WideCharCount: integer): SynUnicode; overload; {$ifdef HASINLINE}inline;{$endif} /// convert an Unicode buffer into a WinAnsi (code page 1252) string procedure UnicodeBufferToWinAnsi(source: PWideChar; out Dest: WinAnsiString); /// convert an Unicode buffer into a RTL string function UnicodeBufferToString(source: PWideChar): string; /// convert an Unicode buffer into a UTF-8 string function UnicodeBufferToUtf8(source: PWideChar): RawUtf8; {$ifdef HASINLINE} inline; {$endif} /// convert an Unicode buffer into a variant storing a UTF-8 string // - could be used e.g. as TDocVariantData.AddValue() parameter function UnicodeBufferToVariant(source: PWideChar): variant; /// convert any RTL string into a variant storing a UTF-8 string // - could be used e.g. as TDocVariantData.AddValue() parameter function StringToVariant(const Txt: string): variant; overload; /// convert any RTL string into a variant storing a UTF-8 string // - could be used e.g. as TDocVariantData.AddValue() parameter procedure StringToVariant(const Txt: string; var result: variant); overload; {$ifdef HASVARUSTRING} /// convert a Delphi 2009+ or FPC Unicode string into our UTF-8 string function UnicodeStringToUtf8(const S: UnicodeString): RawUtf8; inline; // this function is the same as direct RawUtf8=AnsiString(CP_UTF8) assignment // but is faster, since it uses no Win32 API call function Utf8DecodeToUnicodeString(const S: RawUtf8): UnicodeString; overload; inline; /// convert an UTF-8 encoded buffer into a Delphi 2009+ or FPC Unicode string // - this function is the same as direct assignment, since RawUtf8=AnsiString(CP_UTF8), // but is faster, since use no Win32 API call procedure Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer; var result: UnicodeString); overload; /// convert a Delphi 2009+ or FPC Unicode string into a WinAnsi (code page 1252) string function UnicodeStringToWinAnsi(const S: UnicodeString): WinAnsiString; inline; /// convert our UTF-8 encoded buffer into a Delphi 2009+ or FPC Unicode string // - this function is the same as direct assignment, since RawUtf8=AnsiString(CP_UTF8), // but is faster, since use no Win32 API call function Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer): UnicodeString; overload; inline; /// convert a Win-Ansi encoded buffer into a Delphi 2009+ or FPC Unicode string // - this function is faster than default RTL, since use no Win32 API call function WinAnsiToUnicodeString(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): UnicodeString; overload; /// convert a Win-Ansi string into a Delphi 2009+ or FPC Unicode string // - this function is faster than default RTL, since use no Win32 API call function WinAnsiToUnicodeString(const WinAnsi: WinAnsiString): UnicodeString; inline; overload; {$endif HASVARUSTRING} /// convert an UTF-8 encoded buffer into a UTF-16 encoded RawByteString buffer // - could be used instead of deprecated RawUnicode when a temp UTF-16 buffer is needed function Utf8DecodeToUnicodeRawByteString(P: PUtf8Char; L: integer): RawByteString; overload; /// convert an UTF-8 encoded buffer into a UTF-16 encoded RawByteString buffer // - could be used instead of deprecated RawUnicode when a temp UTF-16 buffer is needed function Utf8DecodeToUnicodeRawByteString(const U: RawUtf8): RawByteString; overload; /// convert an UTF-8 encoded buffer into a UTF-16 encoded stream of bytes function Utf8DecodeToUnicodeStream(P: PUtf8Char; L: integer): TStream; /// convert a Win-Ansi encoded buffer into a Delphi 2009+ or FPC Unicode string // - this function is faster than default RTL, since use no Win32 API call function WinAnsiToSynUnicode(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): SynUnicode; overload; /// convert a Win-Ansi string into a Delphi 2009+ or FPC Unicode string // - this function is faster than default RTL, since use no Win32 API call function WinAnsiToSynUnicode(const WinAnsi: WinAnsiString): SynUnicode; {$ifdef HASINLINE}inline;{$endif} overload; /// convert any RTL string into an UTF-8 encoded String // - in the VCL context, it's prefered to use TLanguageFile.StringToUtf8() // method from mORMoti18n, which will handle full i18n of your application // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function StringToUtf8(const Text: string): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string buffer into an UTF-8 encoded String // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) procedure StringToUtf8(Text: PChar; TextLen: PtrInt; var result: RawUtf8); overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string into an UTF-8 encoded String // - this overloaded function use a faster by-reference parameter for the result procedure StringToUtf8(const Text: string; var result: RawUtf8); overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string into an UTF-8 encoded String function ToUtf8(const Text: string): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string into an UTF-8 encoded TSynTempBuffer // - returns the number of UTF-8 bytes available in Temp.buf // - this overloaded function use a TSynTempBuffer for the result to avoid any // memory allocation for the shorter content // - caller should call Temp.Done to release any heap-allocated memory function StringToUtf8(const Text: string; var Temp: TSynTempBuffer): integer; overload; /// convert any Ansi memory buffer into UTF-8, using a TSynTempBuffer if needed // - caller should release any memory by calling Temp.Done // - returns a pointer to the UTF-8 converted buffer - which may be buf function AnsiBufferToTempUtf8(var Temp: TSynTempBuffer; Buf: PAnsiChar; BufLen, CodePage: cardinal): PUtf8Char; /// convert any UTF-8 encoded ShortString Text into an UTF-8 encoded String // - expects the supplied content to be already ASCII-7 or UTF-8 encoded, e.g. // a RTTI type or property name: it won't work with Ansi-encoded strings function ToUtf8(const Ansi7Text: ShortString): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string buffer into an UTF-8 encoded buffer // - Dest must be able to receive at least SourceChars*3 bytes // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function StringBufferToUtf8(Dest: PUtf8Char; Source: PChar; SourceChars: PtrInt): PUtf8Char; overload; /// convert any RTL string 0-terminated Text buffer into an UTF-8 string // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) procedure StringBufferToUtf8(Source: PChar; out result: RawUtf8); overload; /// convert any RTL string into a SynUnicode encoded String // - it's prefered to use TLanguageFile.StringToUtf8() method in mORMoti18n, // which will handle full i18n of your application // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function StringToSynUnicode(const S: string): SynUnicode; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any RTL string into a SynUnicode encoded String // - overloaded to avoid a copy to a temporary result string of a function procedure StringToSynUnicode(const S: string; var result: SynUnicode); overload; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-16 encoded buffer into a RTL string function RawUnicodeToString(P: PWideChar; L: integer): string; overload; /// convert any UTF-16 encoded buffer into a RTL string procedure RawUnicodeToString(P: PWideChar; L: integer; var result: string); overload; /// convert any SynUnicode encoded string into a RTL string function SynUnicodeToString(const U: SynUnicode): string; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded String into a RTL string // - it's prefered to use TLanguageFile.Utf8ToString() in mORMoti18n, // which will handle full i18n of your application // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function Utf8ToString(const Text: RawUtf8): string; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded String into a RTL string procedure Utf8ToStringVar(const Text: RawUtf8; var result: string); {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded String into a generic RTL file name string procedure Utf8ToFileName(const Text: RawUtf8; var result: TFileName); {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded buffer into a RTL string // - it's prefered to use TLanguageFile.Utf8ToString() in mORMoti18n, // which will handle full i18n of your application // - it will work as is with Delphi 2009+ (direct unicode conversion) // - under older version of Delphi (no unicode), it will use the // current RTL codepage, as with WideString conversion (but without slow // WideString usage) function Utf8DecodeToString(P: PUtf8Char; L: integer): string; overload; {$ifdef UNICODE}inline;{$endif} /// convert any UTF-8 encoded buffer into a RTL string procedure Utf8DecodeToString(P: PUtf8Char; L: integer; var result: string); overload; /// convert any UTF-8 encoded String into a generic WideString Text function Utf8ToWideString(const Text: RawUtf8): WideString; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded String into a generic WideString Text procedure Utf8ToWideString(const Text: RawUtf8; var result: WideString); overload; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded String into a generic WideString Text procedure Utf8ToWideString(Text: PUtf8Char; Len: PtrInt; var result: WideString); overload; /// convert any UTF-8 encoded String into a generic SynUnicode Text function Utf8ToSynUnicode(const Text: RawUtf8): SynUnicode; overload; /// convert any UTF-8 encoded String into a generic SynUnicode Text procedure Utf8ToSynUnicode(const Text: RawUtf8; var result: SynUnicode); overload; /// convert any UTF-8 encoded buffer into a generic SynUnicode Text procedure Utf8ToSynUnicode(Text: PUtf8Char; Len: PtrInt; var result: SynUnicode); overload; /// convert any UTF-8 encoded string into an UTF-16 temporary buffer // - returns the number of WideChar stored in temp (not bytes) // - caller should make temp.Done after temp.buf has been used function Utf8DecodeToUnicode(const Text: RawUtf8; var temp: TSynTempBuffer): PtrInt; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any UTF-8 encoded buffer into an UTF-16 temporary buffer function Utf8DecodeToUnicode(Text: PUtf8Char; Len: PtrInt; var temp: TSynTempBuffer): PtrInt; overload; /// convert any Ansi 7-bit encoded String into a RTL string // - the Text content must contain only 7-bit pure ASCII characters function Ansi7ToString(const Text: RawByteString): string; overload; {$ifndef UNICODE}{$ifdef HASINLINE}inline;{$endif}{$endif} /// convert any Ansi 7-bit encoded String into a RTL string // - the Text content must contain only 7-bit pure ASCII characters function Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt): string; overload; {$ifdef HASINLINE}inline;{$endif} /// convert any Ansi 7-bit encoded String into a RTL string // - the Text content must contain only 7-bit pure ASCII characters procedure Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt; var result: string); overload; /// convert any RTL string into Ansi 7-bit encoded String // - the Text content must contain only 7-bit pure ASCII characters function StringToAnsi7(const Text: string): RawByteString; {$ifndef UNICODE}{$ifdef HASINLINE}inline;{$endif}{$endif} /// convert any RTL string into WinAnsi (Win-1252) 8-bit encoded String function StringToWinAnsi(const Text: string): WinAnsiString; {$ifdef UNICODE}inline;{$endif} { **************** Text Case-(in)sensitive Conversion and Comparison } type /// lookup table used for fast case conversion TNormTable = packed array[AnsiChar] of AnsiChar; /// pointer to a lookup table used for fast case conversion PNormTable = ^TNormTable; /// lookup table used for fast case conversion TNormTableByte = packed array[byte] of byte; /// pointer to a lookup table used for fast case conversion PNormTableByte = ^TNormTableByte; var /// lookup table used for fast case conversion to uppercase // - handle 8-bit upper chars as in WinAnsi / code page 1252 (e.g. accents) // - is defined globally, since may be used from an inlined function NormToUpper: TNormTable; NormToUpperByte: TNormTableByte absolute NormToUpper; /// lookup table used for fast case conversion to lowercase // - handle 8-bit upper chars as in WinAnsi / code page 1252 (e.g. accents) // - is defined globally, since may be used from an inlined function NormToLower: TNormTable; NormToLowerByte: TNormTableByte absolute NormToLower; /// this table will convert 'a'..'z' into 'A'..'Z' // - so it will work with UTF-8 without decoding, whereas NormToUpper[] expects // WinAnsi encoding NormToUpperAnsi7: TNormTable; NormToUpperAnsi7Byte: TNormTableByte absolute NormToUpperAnsi7; /// this table will convert 'A'..'Z' into 'a'..'z' // - so it will work with UTF-8 without decoding, whereas NormToUpper[] expects // WinAnsi encoding NormToLowerAnsi7: TNormTable; NormToLowerAnsi7Byte: TNormTableByte absolute NormToLowerAnsi7; /// case sensitive NormToUpper[]/NormToLower[]-like table // - i.e. NormToNorm[c] = c NormToNorm: TNormTable; NormToNormByte: TNormTableByte absolute NormToNorm; const NORM2CASE: array[boolean] of PNormTable = (nil, @NormToUpperAnsi7); type /// character categories for text linefeed/word/identifier/uri parsing // - using such a set compiles into TEST [MEM], IMM so is more efficient // than a regular set of AnsiChar which generates much slower BT [MEM], IMM // - the same 256-byte memory will also be reused from L1 CPU cache // during the parsing of complex input TTextChar = set of ( tcNot01013, tc1013, tcCtrlNotLF, tcCtrlNot0Comma, tcWord, tcIdentifierFirstChar, tcIdentifier, tcUriUnreserved); /// defines an AnsiChar lookup table used for branch-less text parsing TTextCharSet = array[AnsiChar] of TTextChar; /// points to an AnsiChar lookup table used for branch-less text parsing PTextCharSet = ^TTextCharSet; /// defines an Ordinal lookup table used for branch-less text parsing TTextByteSet = array[byte] of TTextChar; /// points to an Ordinal lookup table used for branch-less text parsing PTextByteSet = ^TTextByteSet; var /// lookup table for text linefeed/word/identifier/uri branch-less parsing TEXT_CHARS: TTextCharSet; TEXT_BYTES: TTextByteSet absolute TEXT_CHARS; /// returns TRUE if the given text buffer contains a..z,A..Z,0..9,_ characters // - should match most usual property names values or other identifier names // in the business logic source code // - i.e. can be tested via IdemPropName*() functions, and the MongoDB-like // extended JSON syntax as generated by dvoSerializeAsExtendedJson // - following classic pascal naming convention, first char must be alphabetical // or '_' (i.e. not a digit), following chars can be alphanumerical or '_' function PropNameValid(P: PUtf8Char): boolean; /// returns TRUE if the given text buffers contains A..Z,0..9,_ characters // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - this function allows numbers as first char, so won't check the first char // the same way than PropNameValid() which refuses digits as pascal convention function PropNamesValid(const Values: array of RawUtf8): boolean; /// try to generate a PropNameValid() output from an incoming text // - will trim all spaces, and replace most special chars by '_' // - if it is not PropNameValid() after those replacements, will return fallback function PropNameSanitize(const text, fallback: RawUtf8): RawUtf8; /// case insensitive comparison of ASCII 7-bit identifiers // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - behavior is undefined with UTF-8 encoding (some false positive may occur) function IdemPropName(const P1, P2: ShortString): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// case insensitive comparison of ASCII 7-bit identifiers // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - behavior is undefined with UTF-8 encoding (some false positive may occur) function IdemPropName(const P1: ShortString; P2: PUtf8Char; P2Len: PtrInt): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// case insensitive comparison of ASCII 7-bit identifiers // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - behavior is undefined with UTF-8 encoding (some false positive may occur) // - this version expects P1 and P2 to be a PAnsiChar with specified lengths function IdemPropName(P1, P2: PUtf8Char; P1Len, P2Len: PtrInt): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// case insensitive comparison of ASCII 7-bit identifiers // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - behavior is undefined with UTF-8 encoding (some false positive may occur) // - this version expects P2 to be a PAnsiChar with specified length function IdemPropNameU(const P1: RawUtf8; P2: PUtf8Char; P2Len: PtrInt): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// case insensitive comparison of ASCII 7-bit identifiers of same length // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - behavior is undefined with UTF-8 encoding (some false positive may occur) // - this version expects P1 and P2 to be a PAnsiChar with an already checked // identical length, so may be used for a faster process, e.g. in a loop // - if P1 and P2 are RawUtf8, you should better call overloaded function // IdemPropNameU(const P1,P2: RawUtf8), which would be slightly faster by // using the length stored before the actual text buffer of each RawUtf8 function IdemPropNameUSameLenNotNull(P1, P2: PUtf8Char; P1P2Len: PtrInt): boolean; {$ifdef FPC}inline;{$endif} // Delphi does not like to inline goto type TIdemPropNameUSameLen = function(P1, P2: pointer; P1P2Len: PtrInt): boolean; var /// case (in)sensitive comparison of ASCII 7-bit identifiers of same length IdemPropNameUSameLen: array[{casesensitive=}boolean] of TIdemPropNameUSameLen; /// case insensitive comparison of ASCII 7-bit identifiers // - use it with property names values (i.e. only including A..Z,0..9,_ chars) // - behavior is undefined with UTF-8 encoding (some false positive may occur) // - is an alternative with PropNameEquals() to be used inlined e.g. in a loop function IdemPropNameU(const P1, P2: RawUtf8): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// returns true if the beginning of p^ is the same as up^ // - ignore case - up^ must be already Upper // - chars are compared as 7-bit Ansi only (no accentuated characters): but when // you only need to search for field names e.g. IdemPChar() is prefered, because // it'll be faster than IdemPCharU(), if UTF-8 decoding is not mandatory // - if p is nil, will return FALSE // - if up is nil, will return TRUE function IdemPChar(p: PUtf8Char; up: PAnsiChar): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// returns true if the beginning of p^ is the same as up^ // - this overloaded function accept the uppercase lookup buffer as parameter function IdemPChar(p: PUtf8Char; up: PAnsiChar; table: PNormTable): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// returns true if the beginning of p^ is the same as up^, ignoring white spaces // - ignore case - up^ must be already Upper // - any white space in the input p^ buffer is just ignored // - chars are compared as 7-bit Ansi only (no accentuated characters): but when // you only need to search for field names e.g. IdemPChar() is prefered, because // it'll be faster than IdemPCharU(), if UTF-8 decoding is not mandatory // - if p is nil, will return FALSE // - if up is nil, will return TRUE function IdemPCharWithoutWhiteSpace(p: PUtf8Char; up: PAnsiChar): boolean; /// returns the index of a matching beginning of p^ in upArray[] // - returns -1 if no item matched // - ignore case - upArray^ must be already Upper // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) // - warning: this function expects upArray[] items to have AT LEAST TWO // CHARS (it will use a fast 16-bit comparison of initial 2 bytes) // - consider IdemPPChar() which is faster but a bit more verbose function IdemPCharArray(p: PUtf8Char; const upArray: array of PAnsiChar): integer; /// returns the index of a matching beginning of p^ in nil-terminated up^ array // - returns -1 if no item matched // - ignore case - each up^ must be already Upper // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) // - warning: this function expects up^ items to have AT LEAST TWO CHARS // (it will use a fast 16-bit comparison of initial 2 bytes) function IdemPPChar(p: PUtf8Char; up: PPAnsiChar): PtrInt; /// returns the index of a matching beginning of p^ in upArray two characters // - returns -1 if no item matched // - ignore case - upArray^ must be already Upper // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) function IdemPCharArrayBy2(p: PUtf8Char; const upArrayBy2Chars: RawUtf8): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// returns true if the beginning of p^ is the same as up^ // - ignore case - up^ must be already Upper // - this version will decode the UTF-8 content before using NormToUpper[], so // it will be slower than the IdemPChar() function above, but will handle // WinAnsi accentuated characters (e.g. 'e' acute will be matched as 'E') function IdemPCharU(p, up: PUtf8Char): boolean; {$ifdef HASINLINE}inline;{$endif} /// returns true if the beginning of p^ is same as up^ // - ignore case - up^ must be already Upper // - this version expects p^ to point to an Unicode char array function IdemPCharW(p: PWideChar; up: PUtf8Char): boolean; /// check case-insensitive matching starting of text in upTextStart // - returns true if the item matched // - ignore case - upTextStart must be already in upper case // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) // - see StartWithExact() from mormot.core.text for a case-sensitive version function StartWith(const text, upTextStart: RawUtf8): boolean; /// check case-insensitive matching ending of text in upTextEnd // - returns true if the item matched // - ignore case - upTextEnd must be already in upper case // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) // - see EndWithExact() from mormot.core.text for a case-sensitive version function EndWith(const text, upTextEnd: RawUtf8): boolean; /// returns the index of a case-insensitive matching ending of p^ in upArray[] // - returns -1 if no item matched // - ignore case - upArray[] items must be already in upper case // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) function EndWithArray(const text: RawUtf8; const upArray: array of RawUtf8): integer; /// returns true if the file name extension contained in p^ is the same same as extup^ // - ignore case - extup^ must be already Upper // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) // - could be used e.g. like IdemFileExt(aFileName,'.JP'); function IdemFileExt(p: PUtf8Char; extup: PAnsiChar; sepChar: AnsiChar = '.'): boolean; /// returns matching file name extension index as extup^ // - ignore case - extup[] must be already Upper // - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8) // - could be used e.g. like IdemFileExts(aFileName,['.PAS','.INC']); function IdemFileExts(p: PUtf8Char; const extup: array of PAnsiChar; sepChar: AnsiChar = '.'): integer; /// fast retrieve the position of any value of a given set of characters // - see also strspn() function which is likely to be faster function PosCharAny(Str: PUtf8Char; Characters: PAnsiChar): PUtf8Char; /// a non case-sensitive RawUtf8 version of Pos() // - uppersubstr is expected to be already in upper case // - this version handle only 7-bit ASCII (no accentuated characters) // - see PosIU() if you want an UTF-8 version with accentuated chars support function PosI(uppersubstr: PUtf8Char; const str: RawUtf8): PtrInt; /// a non case-sensitive version of Pos() // - uppersubstr is expected to be already in upper case // - this version handle only 7-bit ASCII (no accentuated characters) function StrPosI(uppersubstr, str: PUtf8Char): PUtf8Char; /// a non case-sensitive RawUtf8 version of Pos() // - substr is expected to be already in upper case // - this version will decode the UTF-8 content before using NormToUpper[] // - see PosI() for a non-accentuated, but faster version function PosIU(substr: PUtf8Char; const str: RawUtf8): integer; /// pure pascal version of strspn(), to be used with PUtf8Char/PAnsiChar // - returns size of initial segment of s which appears in accept chars, e.g. // ! strspn('abcdef','debca')=5 // - please note that this optimized version may read up to 3 bytes beyond // accept but never after s end, so is safe e.g. over memory mapped files function strspn(s, accept: pointer): integer; /// pure pascal version of strcspn(), to be used with PUtf8Char/PAnsiChar // - returns size of initial segment of s which doesn't appears in reject chars, e.g. // ! strcspn('1234,6789',',')=4 // - please note that this optimized version may read up to 3 bytes beyond // reject but never after s end, so is safe e.g. over memory mapped files function strcspn(s, reject: pointer): integer; /// our fast version of StrCompL(), to be used with PUtf8Char // - i.e. make a binary comparison of two memory buffers, using supplied length // - Default value is returned if both P1 and P2 buffers are equal function StrCompL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// our fast version of StrCompIL(), to be used with PUtf8Char // - i.e. make a case-insensitive comparison of two memory buffers, using // supplied length // - Default value is returned if both P1 and P2 buffers are equal function StrCompIL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// our fast version of StrIComp(), to be used with PUtf8Char/PAnsiChar function StrIComp(Str1, Str2: pointer): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// StrIComp-like function with a lookup table and Str1/Str2 expected not nil function StrICompNotNil(Str1, Str2: pointer; Up: PNormTableByte): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// StrIComp-like function with a length, lookup table and Str1/Str2 expected not nil function StrICompLNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// StrIComp function with a length, lookup table and Str1/Str2 expected not nil // - returns L for whole match, or < L for a partial match function StrILNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt; {$ifdef HASINLINE}inline;{$endif} type /// function prototype used internally for UTF-8 buffer comparison // - also used e.g. in mormot.core.variants unit TUtf8Compare = function(P1, P2: PUtf8Char): PtrInt; var /// a quick wrapper to StrComp or StrIComp comparison functions StrCompByCase: array[{CaseInsensitive=}boolean] of TUtf8Compare; /// retrieve the next UCS4 CodePoint stored in U, then update the U pointer // - this function will decode the UTF-8 content before using NormToUpper[] // - will return '?' if the UCS4 CodePoint is higher than #255: so use this function // only if you need to deal with ASCII characters (e.g. it's used for Soundex // and for ContainsUtf8 function) function GetNextUtf8Upper(var U: PUtf8Char): Ucs4CodePoint; {$ifdef HASINLINE}inline;{$endif} /// points to the beginning of the next word stored in U // - returns nil if reached the end of U (i.e. #0 char) // - here a "word" is a Win-Ansi word, i.e. '0'..'9', 'A'..'Z' function FindNextUtf8WordBegin(U: PUtf8Char): PUtf8Char; /// return true if UpperValue (Ansi) is contained in A^ (Ansi) // - find UpperValue starting at word beginning, not inside words function FindAnsi(A, UpperValue: PAnsiChar): boolean; /// return true if UpperValue (Ansi) is contained in U^ (UTF-8 encoded) // - find UpperValue starting at word beginning, not inside words // - UTF-8 decoding is done on the fly (no temporary decoding buffer is used) function FindUtf8(U: PUtf8Char; UpperValue: PAnsiChar): boolean; /// return true if Upper (Unicode encoded) is contained in U^ (UTF-8 encoded) // - will use the slow but accurate Operating System API (Win32 or ICU) // to perform the comparison at Unicode-level // - consider using StrPosIReference() for our faster Unicode 10.0 version function FindUnicode(PW: PWideChar; Upper: PWideChar; UpperLen: PtrInt): boolean; /// return true if up^ is contained inside the UTF-8 buffer p^ // - search up^ at the beginning of every UTF-8 word (aka in Soundex) // - here a "word" is a Win-Ansi word, i.e. '0'..'9', 'A'..'Z' // - up^ must be already Upper function ContainsUtf8(p, up: PUtf8Char): boolean; /// returns TRUE if the supplied uppercased text is contained in the text buffer function GetLineContains(p, pEnd, up: PUtf8Char): boolean; {$ifdef FPC}inline;{$endif} // Delphi does not like inlining goto+label /// copy source into a 256 chars dest^ buffer with 7-bit upper case conversion // - used internally for short keys match or case-insensitive hash // - returns final dest pointer // - will copy up to 255 AnsiChar (expect the dest buffer to be defined e.g. as // array[byte] of AnsiChar on the caller stack) function UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PAnsiChar; overload; {$ifdef HASINLINE}inline;{$endif} /// copy source^ into a 256 chars dest^ buffer with 7-bit upper case conversion // - used internally for short keys match or case-insensitive hash // - returns final dest pointer // - will copy up to 255 AnsiChar (expect the dest buffer to be defined e.g. as // array[byte] of AnsiChar on the caller stack) function UpperCopy255Buf(dest: PAnsiChar; source: PUtf8Char; sourceLen: PtrInt): PAnsiChar; /// copy source into dest^ with WinAnsi 8-bit upper case conversion // - used internally for short keys match or case-insensitive hash // - returns final dest pointer // - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of // AnsiChar) function UpperCopyWin255(dest: PWinAnsiChar; const source: RawUtf8): PWinAnsiChar; /// copy UTF-16 source into dest^ with ASCII 7-bit upper case conversion // - used internally for short keys match or case-insensitive hash // - returns final dest pointer // - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of // AnsiChar), replacing any non WinAnsi character by '?' function UpperCopy255W(dest: PAnsiChar; const source: SynUnicode): PAnsiChar; overload; {$ifdef HASINLINE}inline;{$endif} /// copy WideChar source into dest^ with upper case conversion // - used internally for short keys match or case-insensitive hash // - returns final dest pointer // - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of // AnsiChar), replacing any non WinAnsi character by '?' function UpperCopy255W(dest: PAnsiChar; source: PWideChar; L: PtrInt): PAnsiChar; overload; /// copy source into dest^ with ASCII 7-bit upper case conversion // - returns final dest pointer // - will copy up to the source buffer end: so Dest^ should be big enough - // which will the case e.g. if Dest := pointer(source) function UpperCopy(dest: PAnsiChar; const source: RawUtf8): PAnsiChar; /// copy source into dest^ with ASCII 7-bit upper case conversion // - returns final dest pointer // - this special version expect source to be a ShortString function UpperCopyShort(dest: PAnsiChar; const source: ShortString): PAnsiChar; /// fast UTF-8 comparison handling WinAnsi CP-1252 case folding // - this version expects u1 and u2 to be zero-terminated // - decode the UTF-8 content before using NormToUpper[] lookup table // - match the our SYSTEMNOCASE custom (and default) SQLite 3 collation // - consider Utf8ICompReference() for Unicode 10.0 support function Utf8IComp(u1, u2: PUtf8Char): PtrInt; /// fast UTF-8 comparison handling WinAnsi CP-1252 case folding // - this version expects u1 and u2 not to be necessary zero-terminated, but // uses L1 and L2 as length for u1 and u2 respectively // - decode the UTF-8 content before using NormToUpper[] lookup table // - consider Utf8ILCompReference() for Unicode 10.0 support function Utf8ILComp(u1, u2: PUtf8Char; L1, L2: cardinal): PtrInt; /// copy UTF-8 buffer into dest^ handling WinAnsi CP-1252 NormToUpper[] folding // - returns the final dest pointer // - current implementation handles UTF-16 surrogates function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Char; /// copy UTF-8 buffer into dest^ handling WinAnsi CP-1252 NormToUpper[] folding // - returns the final dest pointer // - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of // AnsiChar), with UTF-8 encoding function Utf8UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PUtf8Char; {$ifdef HASINLINE}inline;{$endif} /// fast case-insensitive Unicode comparison handling ASCII 7-bit chars // - use the NormToUpperAnsi7Byte[] array, i.e. compare 'a'..'z' as 'A'..'Z' // - this version expects u1 and u2 to be zero-terminated function AnsiICompW(u1, u2: PWideChar): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// compare two "array of AnsiString" elements, with no case sensitivity // - just a wrapper around inlined StrIComp() function SortDynArrayAnsiStringI(const A, B): integer; /// compare two "array of PUtf8Char/PAnsiChar" elements, with no case sensitivity // - just a wrapper around inlined StrIComp() function SortDynArrayPUtf8CharI(const A, B): integer; /// compare two "array of RTL string" elements, with no case sensitivity // - the expected string type is the RTL string // - just a wrapper around StrIComp() for AnsiString or AnsiICompW() for UNICODE function SortDynArrayStringI(const A, B): integer; /// compare two "array of WideString/UnicodeString" elements, with no case sensitivity // - implemented here since would call AnsiICompW() function SortDynArrayUnicodeStringI(const A, B): integer; var /// a quick wrapper to SortDynArrayAnsiString or SortDynArrayAnsiStringI // comparison functions SortDynArrayAnsiStringByCase: array[{CaseInsensitive=}boolean] of TDynArraySortCompare; /// SameText() overloaded function with proper UTF-8 decoding // - fast version using NormToUpper[] array for all WinAnsi characters // - this version will decode each UTF-8 glyph before using NormToUpper[] // - current implementation handles UTF-16 surrogates as Utf8IComp() function SameTextU(const S1, S2: RawUtf8): boolean; {$ifdef HASINLINE}inline;{$endif} /// fast conversion of the supplied text into 8-bit uppercase // - this will not only convert 'a'..'z' into 'A'..'Z', but also accentuated // latin characters ('e' acute into 'E' e.g.), using NormToUpper[] array // - it will therefore decode the supplied UTF-8 content to handle more than // 7-bit of ascii characters (so this function is dedicated to WinAnsi code page // 1252 characters set) function UpperCaseU(const S: RawUtf8): RawUtf8; /// fast conversion of the supplied text into 8-bit lowercase // - this will not only convert 'A'..'Z' into 'a'..'z', but also accentuated // latin characters ('E' acute into 'e' e.g.), using NormToLower[] array // - it will therefore decode the supplied UTF-8 content to handle more than // 7-bit of ascii characters function LowerCaseU(const S: RawUtf8): RawUtf8; /// fast conversion of the supplied text into 8-bit case sensitivity // - convert the text in-place, returns the resulting length // - it will decode the supplied UTF-8 content to handle more than 7-bit // of ascii characters during the conversion (leaving not WinAnsi characters // untouched) // - will not set the last char to #0 (caller must do that if necessary) function ConvertCaseUtf8(P: PUtf8Char; const Table: TNormTableByte): PtrInt; /// check if the supplied text has some case-insentitive 'a'..'z','A'..'Z' chars // - will therefore be correct with true UTF-8 content, but only for 7-bit function IsCaseSensitive(const S: RawUtf8): boolean; overload; /// check if the supplied text has some case-insentitive 'a'..'z','A'..'Z' chars // - will therefore be correct with true UTF-8 content, but only for 7-bit function IsCaseSensitive(P: PUtf8Char; PLen: PtrInt): boolean; overload; /// low-level function called when inlining UpperCase(Copy) and LowerCase(Copy) procedure CaseCopy(Text: PUtf8Char; Len: PtrInt; Table: PNormTable; var Dest: RawUtf8); /// low-level function called when inlining UpperCaseSelf and LowerCaseSelf procedure CaseSelf(var S: RawUtf8; Table: PNormTable); /// fast conversion of the supplied text into uppercase // - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and // will therefore be correct with true UTF-8 content, but only for 7-bit function UpperCase(const S: RawUtf8): RawUtf8; {$ifdef HASINLINE} inline; {$endif} /// fast conversion of the supplied text into uppercase // - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and // will therefore be correct with true UTF-8 content, but only for 7-bit procedure UpperCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8); overload; {$ifdef HASINLINE} inline; {$endif} /// fast conversion of the supplied text into uppercase // - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and // will therefore be correct with true UTF-8 content, but only for 7-bit procedure UpperCaseCopy(const Source: RawUtf8; var Dest: RawUtf8); overload; {$ifdef HASINLINE} inline; {$endif} /// fast in-place conversion of the supplied variable text into uppercase // - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and // will therefore be correct with true UTF-8 content, but only for 7-bit procedure UpperCaseSelf(var S: RawUtf8); {$ifdef HASINLINE} inline; {$endif} /// fast conversion of the supplied text into lowercase // - this will only convert 'A'..'Z' into 'a'..'z' (no NormToLower use), and // will therefore be correct with true UTF-8 content function LowerCase(const S: RawUtf8): RawUtf8; {$ifdef HASINLINE} inline; {$endif} /// fast conversion of the supplied text into lowercase // - this will only convert 'A'..'Z' into 'a'..'z' (no NormToLower use), and // will therefore be correct with true UTF-8 content procedure LowerCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8); {$ifdef HASINLINE} inline; {$endif} /// fast in-place conversion of the supplied variable text into lowercase // - this will only convert 'A'..'Z' into 'a'..'z' (no NormToLower use), and // will therefore be correct with true UTF-8 content, but only for 7-bit procedure LowerCaseSelf(var S: RawUtf8); {$ifdef HASINLINE} inline; {$endif} /// accurate conversion of the supplied UTF-8 content into the corresponding // upper-case Unicode characters // - will use the available API (e.g. Win32 or ICU), so may not be consistent on // all systems - consider UpperCaseReference() to use our Unicode 10.0 tables // - will temporary decode S into and from UTF-16 so is likely to be slower function UpperCaseUnicode(const S: RawUtf8): RawUtf8; /// accurate conversion of the supplied UTF-8 content into the corresponding // lower-case Unicode characters // - will use the available API (e.g. Win32 or ICU), so may not be consistent on // all systems - and also slower than LowerCase/LowerCaseU versions function LowerCaseUnicode(const S: RawUtf8): RawUtf8; /// use the RTL to convert the SynUnicode text to UpperCase function UpperCaseSynUnicode(const S: SynUnicode): SynUnicode; /// use the RTL to convert the SynUnicode text to LowerCase function LowerCaseSynUnicode(const S: SynUnicode): SynUnicode; /// fast WinAnsi comparison using the NormToUpper[] array for all 8-bit values function AnsiIComp(Str1, Str2: pointer): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// internal function used when inlining PosExI() function PosExIPas(pSub, p: PUtf8Char; Offset: PtrUInt; Lookup: PNormTable): PtrInt; /// a ASCII-7 case-insensitive version of PosEx() // - will use the NormToUpperAnsi7 lookup table for character conversion function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt): PtrInt; overload; {$ifdef HASINLINE}inline;{$endif} /// a case-insensitive version of PosEx() with a specified lookup table // - redirect to mormot.core.base PosEx() if Lookup = nil function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt; Lookup: PNormTable): PtrInt; overload; {$ifdef HASINLINE}inline;{$endif} { ************ UTF-8 String Manipulation Functions } type /// used to store a set of 8-bit encoded characters TSynAnsicharSet = set of AnsiChar; /// used to store a set of 8-bit unsigned integers TSynByteSet = set of byte; /// a generic callback, which can be used to translate some text on the fly // - maps procedure TLanguageFile.Translate(var English: string) signature // as defined in mORMoti18n.pas // - can be used e.g. for TSynMustache's {{"English text}} callback TOnStringTranslate = procedure(var English: string) of object; /// check case-sensitive matching starting of text in start // - returns true if the item matched // - see StartWith() from mormot.core.unicode for a case-insensitive version function StartWithExact(const text, textStart: RawUtf8): boolean; {$ifdef HASINLINE} inline; {$endif} /// check case-sensitive matching ending of text in ending // - returns true if the item matched // - see EndWith() from mormot.core.unicode for a case-insensitive version function EndWithExact(const text, textEnd: RawUtf8): boolean; {$ifdef HASINLINE} inline; {$endif} /// extract a line from source array of chars // - next will contain the beginning of next line, or nil if source has ended function GetNextLine(source: PUtf8Char; out next: PUtf8Char; andtrim: boolean = false): RawUtf8; /// returns n leading characters function LeftU(const S: RawUtf8; n: PtrInt): RawUtf8; {$ifdef HASINLINE} inline; {$endif} /// returns n trailing characters function RightU(const S: RawUtf8; n: PtrInt): RawUtf8; /// trims leading whitespace characters from the string by removing // new line, space, and tab characters function TrimLeft(const S: RawUtf8): RawUtf8; /// trims trailing whitespace characters from the string by removing trailing // newline, space, and tab characters function TrimRight(const S: RawUtf8): RawUtf8; /// trims leading whitespaces of every lines of the UTF-8 text // - also delete void lines // - could be used e.g. before FindNameValue() call // - modification is made in-place so S will be modified procedure TrimLeftLines(var S: RawUtf8); /// trim some trailing and ending chars // - if S is unique (RefCnt=1), will modify the RawUtf8 in place // - faster alternative to S := copy(S, Left + 1, length(S) - Left - Right) procedure TrimChars(var S: RawUtf8; Left, Right: PtrInt); /// returns the supplied text content, without any specified char // - specify a custom char set to be excluded, e.g. as [#0 .. ' '] function TrimChar(const text: RawUtf8; const exclude: TSynAnsicharSet): RawUtf8; /// returns the supplied text content, without one specified char function TrimOneChar(const text: RawUtf8; exclude: AnsiChar): RawUtf8; /// returns the supplied text content, without any other char than specified // - specify a custom char set to be included, e.g. as ['A'..'Z'] function OnlyChar(const text: RawUtf8; const only: TSynAnsicharSet): RawUtf8; /// returns the supplied text content, without any control char // - here control chars have an ASCII code in [#0 .. ' '], i.e. text[] <= ' ' function TrimControlChars(const text: RawUtf8): RawUtf8; /// split a RawUtf8 string into two strings, according to SepStr separator // - returns true and LeftStr/RightStr if they were separated by SepStr // - if SepStr is not found, LeftStr=Str and RightStr='' and returns false // - if ToUpperCase is TRUE, then LeftStr and RightStr will be made uppercase function Split(const Str, SepStr: RawUtf8; var LeftStr, RightStr: RawUtf8; ToUpperCase: boolean = false): boolean; overload; /// split a RawUtf8 string into two strings, according to SepStr separator // - this overloaded function returns the right string as function result // - if SepStr is not found, LeftStr=Str and result='' // - if ToUpperCase is TRUE, then LeftStr and result will be made uppercase function Split(const Str, SepStr: RawUtf8; var LeftStr: RawUtf8; ToUpperCase: boolean = false): RawUtf8; overload; /// split a RawUtf8 string into several strings, according to SepStr separator // - this overloaded function will fill a DestPtr[] array of PRawUtf8 // - if any DestPtr[]=nil, the item will be skipped // - if input Str end before al SepStr[] are found, DestPtr[] is set to '' // - returns the number of values extracted into DestPtr[] function Split(const Str: RawUtf8; const SepStr: array of RawUtf8; const DestPtr: array of PRawUtf8): PtrInt; overload; /// returns the last occurrence of the given SepChar separated context // - e.g. SplitRight('01/2/34','/')='34' // - if SepChar doesn't appear, will return Str, e.g. SplitRight('123','/')='123' // - if LeftStr is supplied, the RawUtf8 it points to will be filled with // the left part just before SepChar ('' if SepChar doesn't appear) function SplitRight(const Str: RawUtf8; SepChar: AnsiChar; LeftStr: PRawUtf8 = nil): RawUtf8; /// returns the last occurrence of the given SepChar separated context // - e.g. SplitRight('path/one\two/file.ext','/\')='file.ext', i.e. // SepChars='/\' will be like ExtractFileName() over RawUtf8 string // - if SepChar doesn't appear, will return Str, e.g. SplitRight('123','/')='123' function SplitRights(const Str, SepChar: RawUtf8): RawUtf8; /// check all character within text are spaces or control chars // - i.e. a faster alternative to if TrimU(text)='' then function IsVoid(const text: RawUtf8): boolean; /// fill all bytes of this memory buffer with zeros, i.e. 'toto' -> #0#0#0#0 // - will write the memory buffer directly, if this string instance is not shared // (i.e. has refcount = 1), to avoid zeroing still-used values // - may be used to cleanup stack-allocated content // ! ... finally FillZero(secret); end; procedure FillZero(var secret: RawByteString); overload; /// fill all bytes of this UTF-8 string with zeros, i.e. 'toto' -> #0#0#0#0 // - will write the memory buffer directly, if this string instance is not shared // (i.e. has refcount = 1), to avoid zeroing still-used values // - may be used to cleanup stack-allocated content // ! ... finally FillZero(secret); end; procedure FillZero(var secret: RawUtf8); overload; {$ifdef HASINLINE}inline;{$endif} /// fill all bytes of this UTF-8 string with zeros, i.e. 'toto' -> #0#0#0#0 // - SpiUtf8 type has been defined explicitly to store Sensitive Personal // Information procedure FillZero(var secret: SpiUtf8); overload; {$ifdef HASINLINE}inline;{$endif} /// fill all bytes of this dynamic array of bytes with zeros // - will write the memory buffer directly, if this array instance is not shared // (i.e. has refcount = 1), to avoid zeroing still-used values procedure FillZero(var secret: TBytes); overload; {$ifdef HASVARUSTRING} /// fill all bytes of this UTF-16 string with zeros, i.e. 'toto' -> #0#0#0#0 procedure FillZero(var secret: UnicodeString); overload; {$endif HASVARUSTRING} /// actual replacement function called by StringReplaceAll() on first match // - not to be called as such, but defined globally for proper inlining function StringReplaceAllProcess(const S, OldPattern, NewPattern: RawUtf8; found: integer; Lookup: PNormTable): RawUtf8; /// fast version of StringReplace(S, OldPattern, NewPattern, [rfReplaceAll]); function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8; Lookup: PNormTable = nil): RawUtf8; overload; /// case-sensitive (or not) StringReplace(S, OldPattern, NewPattern,[rfReplaceAll]) // - calls plain StringReplaceAll() version for CaseInsensitive = false // - calls StringReplaceAll(.., NormToUpperAnsi7) if CaseInsensitive = true function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8; CaseInsensitive: boolean): RawUtf8; overload; {$ifdef HASINLINE}inline;{$endif} /// fast version of several cascaded StringReplaceAll() function StringReplaceAll(const S: RawUtf8; const OldNewPatternPairs: array of RawUtf8; CaseInsensitive: boolean = false): RawUtf8; overload; /// fast replace of a specified char by a given string function StringReplaceChars(const Source: RawUtf8; OldChar, NewChar: AnsiChar): RawUtf8; /// fast replace of all #9 chars by a given string function StringReplaceTabs(const Source, TabText: RawUtf8): RawUtf8; /// UTF-8 dedicated (and faster) alternative to StringOfChar((Ch,Count)) function RawUtf8OfChar(Ch: AnsiChar; Count: integer): RawUtf8; /// format a text content with SQL-like quotes // - this function implements what is specified in the official SQLite3 // documentation: "A string constant is formed by enclosing the string in single // quotes ('). A single quote within the string can be encoded by putting two // single quotes in a row - as in Pascal." function QuotedStr(const S: RawUtf8; Quote: AnsiChar = ''''): RawUtf8; overload; /// format a text content with SQL-like quotes procedure QuotedStr(const S: RawUtf8; Quote: AnsiChar; var result: RawUtf8); overload; /// format a text buffer with SQL-like quotes procedure QuotedStr(P: PUtf8Char; PLen: PtrInt; Quote: AnsiChar; var result: RawUtf8); overload; /// unquote a SQL-compatible string // - the first character in P^ must be either ' or " then internal double quotes // are transformed into single quotes // - 'text '' end' -> text ' end // - "text "" end" -> text " end // - returns nil if P doesn't contain a valid SQL string // - returns a pointer just after the quoted text otherwise function UnQuoteSqlStringVar(P: PUtf8Char; out Value: RawUtf8): PUtf8Char; /// unquote a SQL-compatible string function UnQuoteSqlString(const Value: RawUtf8): RawUtf8; /// unquote a SQL-compatible symbol name // - e.g. '[symbol]' -> 'symbol' or '"symbol"' -> 'symbol' function UnQuotedSqlSymbolName(const ExternalDBSymbol: RawUtf8): RawUtf8; /// get the next character after a quoted buffer // - the first character in P^ must be either ', either " // - it will return the latest quote position, ignoring double quotes within function GotoEndOfQuotedString(P: PUtf8Char): PUtf8Char; {$ifdef HASINLINE}inline;{$endif} /// get the next character not in [#1..' '] function GotoNextNotSpace(P: PUtf8Char): PUtf8Char; {$ifdef HASINLINE}inline;{$endif} /// get the next character not in [#9,' '] function GotoNextNotSpaceSameLine(P: PUtf8Char): PUtf8Char; {$ifdef HASINLINE}inline;{$endif} /// get the next character in [#0..' '] function GotoNextSpace(P: PUtf8Char): PUtf8Char; {$ifdef HASINLINE}inline;{$endif} /// check if the next character not in [#1..' '] matchs a given value // - first ignore any non space character // - then returns TRUE if P^=ch, setting P to the character after ch // - or returns FALSE if P^<>ch, leaving P at the level of the unexpected char function NextNotSpaceCharIs(var P: PUtf8Char; ch: AnsiChar): boolean; {$ifdef HASINLINE}inline;{$endif} /// retrieve the next SQL-like identifier within the UTF-8 buffer // - will also trim any space (or line feeds) and trailing ';' // - any comment like '/*nocache*/' will be ignored // - returns true if something was set to Prop function GetNextFieldProp(var P: PUtf8Char; var Prop: RawUtf8): boolean; /// retrieve the next identifier within the UTF-8 buffer on the same line // - GetNextFieldProp() will just handle line feeds (and ';') as spaces - which // is fine e.g. for SQL, but not for regular config files with name/value pairs // - returns true if something was set to Prop function GetNextFieldPropSameLine(var P: PUtf8Char; var Prop: ShortString): boolean; /// return true if IdemPChar(source,searchUp), and go to the next line of source function IdemPCharAndGetNextLine(var source: PUtf8Char; searchUp: PAnsiChar): boolean; /// search for a value from its uppercased named entry // - i.e. iterate IdemPChar(source,UpperName) over every line of the source // - returns the text just after UpperName if it has been found at line beginning // - returns nil if UpperName was not found at any line beginning // - could be used e.g. to efficently extract a value from HTTP headers, whereas // FindIniNameValue() is tuned for [section]-oriented INI files function FindNameValue(P: PUtf8Char; UpperName: PAnsiChar): PUtf8Char; overload; /// search and returns a value from its uppercased named entry // - i.e. iterate IdemPChar(source,UpperName) over every line of the source // - returns true and the trimmed text just after UpperName into Value // if it has been found at line beginning // - returns false and set Value := '' if UpperName was not found (or leave // Value untouched if KeepNotFoundValue is true) // - could be used e.g. to efficently extract a value from HTTP headers, whereas // FindIniNameValue() is tuned for [section]-oriented INI files // - do TrimLeftLines(NameValuePairs) first if the lines start with spaces/tabs function FindNameValue(const NameValuePairs: RawUtf8; UpperName: PAnsiChar; var Value: RawUtf8; KeepNotFoundValue: boolean = false; UpperNameSeparator: AnsiChar = #0): boolean; overload; {$ifdef HASINLINE}inline;{$endif} /// search and returns a PUtf8Char value from its uppercased named entry // - as called when inlining FindNameValue() // - won't make any memory allocation, so could be fine for a quick lookup function FindNameValuePointer(NameValuePairs: PUtf8Char; UpperName: PAnsiChar; out FoundLen: PtrInt; UpperNameSeparator: AnsiChar): PUtf8Char; /// compute the line length from source array of chars // - if PEnd = nil, end counting at either #0, #13 or #10 // - otherwise, end counting at either #13 or #10 // - just a wrapper around BufferLineLength() checking PEnd=nil case function GetLineSize(P, PEnd: PUtf8Char): PtrUInt; {$ifdef HASINLINE}inline;{$endif} /// returns true if the line length from source array of chars is not less than // the specified count function GetLineSizeSmallerThan(P, PEnd: PUtf8Char; aMinimalCount: integer): boolean; {$ifndef PUREMORMOT2} /// return next string delimited with #13#10 from P, nil if no more // - this function returns a RawUnicode string type function GetNextStringLineToRawUnicode(var P: PChar): RawUnicode; {$endif PUREMORMOT2} /// trim first lowercase chars ('otDone' will return 'Done' e.g.) // - return a PUtf8Char to avoid any memory allocation function TrimLeftLowerCase(const V: RawUtf8): PUtf8Char; /// trim first lowercase chars ('otDone' will return 'Done' e.g.) // - return an RawUtf8 string: enumeration names are pure 7-bit ANSI with Delphi 7 // to 2007, and UTF-8 encoded with Delphi 2009+ function TrimLeftLowerCaseShort(V: PShortString): RawUtf8; /// trim first lowercase chars ('otDone' will return 'Done' e.g.) // - return a ShortString: enumeration names are pure 7-bit ANSI with Delphi 7 // to 2007, and UTF-8 encoded with Delphi 2009+ function TrimLeftLowerCaseToShort(V: PShortString): ShortString; overload; {$ifdef HASINLINE}inline;{$endif} /// trim first lowercase chars ('otDone' will return 'Done' e.g.) // - return a ShortString: enumeration names are pure 7-bit ANSI with Delphi 7 // to 2007, and UTF-8 encoded with Delphi 2009+ procedure TrimLeftLowerCaseToShort(V: PShortString; out result: ShortString); overload; /// fast append some UTF-8 text into a ShortString, with an ending ',' procedure AppendShortComma(text: PAnsiChar; len: PtrInt; var result: ShortString; trimlowercase: boolean); {$ifdef FPC} inline; {$endif} /// fast search of an exact case-insensitive match of a RTTI's PShortString array function FindShortStringListExact(List: PShortString; MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer; /// fast case-insensitive search of a left-trimmed lowercase match // of a RTTI's PShortString array function FindShortStringListTrimLowerCase(List: PShortString; MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer; /// fast case-sensitive search of a left-trimmed lowercase match // of a RTTI's PShortString array function FindShortStringListTrimLowerCaseExact(List: PShortString; MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer; /// convert a CamelCase string into a space separated one // - 'OnLine' will return 'On line' e.g., and 'OnMyLINE' will return 'On my LINE' // - will handle capital words at the beginning, middle or end of the text, e.g. // 'KLMFlightNumber' will return 'KLM flight number' and 'GoodBBCProgram' will // return 'Good BBC program' // - will handle a number at the beginning, middle or end of the text, e.g. // 'Email12' will return 'Email 12' // - '_' char is transformed into ' - ' // - '__' chars are transformed into ': ' // - return an RawUtf8 string: enumeration names are pure 7-bit ANSI with Delphi // up to 2007, and UTF-8 encoded with Delphi 2009+ function UnCamelCase(const S: RawUtf8): RawUtf8; overload; /// convert a CamelCase string into a space separated one // - 'OnLine' will return 'On line' e.g., and 'OnMyLINE' will return 'On my LINE' // - will handle capital words at the beginning, middle or end of the text, e.g. // 'KLMFlightNumber' will return 'KLM flight number' and 'GoodBBCProgram' will // return 'Good BBC program' // - will handle a number at the beginning, middle or end of the text, e.g. // 'Email12' will return 'Email 12' // - return the char count written into D^ // - D^ and P^ are expected to be UTF-8 encoded: enumeration and property names // are pure 7-bit ANSI with Delphi 7 to 2007, and UTF-8 encoded with Delphi 2009+ // - '_' char is transformed into ' - ' // - '__' chars are transformed into ': ' function UnCamelCase(D, P: PUtf8Char): integer; overload; /// convert a string into an human-friendly CamelCase identifier // - replacing spaces or punctuations by an uppercase character // - as such, it is not the reverse function to UnCamelCase() procedure CamelCase(P: PAnsiChar; len: PtrInt; var s: RawUtf8; const isWord: TSynByteSet = [ord('0')..ord('9'), ord('a')..ord('z'), ord('A')..ord('Z')]); overload; /// convert a string into an human-friendly CamelCase identifier // - replacing spaces or punctuations by an uppercase character // - as such, it is not the reverse function to UnCamelCase() procedure CamelCase(const text: RawUtf8; var s: RawUtf8; const isWord: TSynByteSet = [ord('0')..ord('9'), ord('a')..ord('z'), ord('A')..ord('Z')]); overload; {$ifdef HASINLINE}inline;{$endif} var /// these procedure type must be defined if a default system.pas is used // - expect generic "string" type, i.e. UnicodeString for Delphi 2009+ LoadResStringTranslate: procedure(var Text: string) = nil; /// UnCamelCase and translate a char buffer // - P is expected to be #0 ended // - return "string" type, i.e. UnicodeString for Delphi 2009+ procedure GetCaptionFromPCharLen(P: PUtf8Char; out result: string); { ************ TRawUtf8DynArray Processing Functions } /// returns TRUE if Value is nil or all supplied Values[] equal '' function IsZero(const Values: TRawUtf8DynArray): boolean; overload; /// quick helper to initialize a dynamic array of RawUtf8 from some constants // - can be used e.g. as: // ! MyArray := TRawUtf8DynArrayFrom(['a','b','c']); function TRawUtf8DynArrayFrom(const Values: array of RawUtf8): TRawUtf8DynArray; /// low-level efficient search of Value in Values[] // - CaseSensitive=false will use StrICmp() for A..Z / a..z equivalence function FindRawUtf8(Values: PRawUtf8; const Value: RawUtf8; ValuesCount: integer; CaseSensitive: boolean): integer; overload; /// return the index of Value in Values[], -1 if not found // - CaseSensitive=false will use StrICmp() for A..Z / a..z equivalence function FindRawUtf8(const Values: TRawUtf8DynArray; const Value: RawUtf8; CaseSensitive: boolean = true): integer; overload; {$ifdef HASINLINE}inline;{$endif} /// return the index of Value in Values[], -1 if not found // - CaseSensitive=false will use StrICmp() for A..Z / a..z equivalence function FindRawUtf8(const Values: array of RawUtf8; const Value: RawUtf8; CaseSensitive: boolean = true): integer; overload; /// true if Value was added successfully in Values[] function AddRawUtf8(var Values: TRawUtf8DynArray; const Value: RawUtf8; NoDuplicates: boolean = false; CaseSensitive: boolean = true): boolean; overload; /// add the Value to Values[], with an external count variable, for performance function AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; const Value: RawUtf8): PtrInt; overload; /// add Value[] items to Values[] procedure AddRawUtf8(var Values: TRawUtf8DynArray; const Value: TRawUtf8DynArray); overload; /// add Value[] items to Values[], with an external count variable, for performance procedure AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; const Value: TRawUtf8DynArray); overload; /// true if both TRawUtf8DynArray are the same // - comparison is case-sensitive function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray): boolean; overload; /// true if both TRawUtf8DynArray are the same for a given number of items // - A and B are expected to have at least Count items // - comparison is case-sensitive function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray; Count: integer): boolean; overload; /// add the Value to Values[] string array function AddString(var Values: TStringDynArray; const Value: string): PtrInt; /// convert the string dynamic array into a dynamic array of UTF-8 strings procedure StringDynArrayToRawUtf8DynArray(const Source: TStringDynArray; var result: TRawUtf8DynArray); /// convert the string list into a dynamic array of UTF-8 strings procedure StringListToRawUtf8DynArray(Source: TStringList; var result: TRawUtf8DynArray); /// retrieve the index where to insert a PUtf8Char in a sorted PUtf8Char array // - R is the last index of available entries in P^ (i.e. Count-1) // - string comparison is case-sensitive StrComp (so will work with any PAnsiChar) // - returns -1 if the specified Value was found (i.e. adding will duplicate a value) // - will use fast O(log(n)) binary search algorithm function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt; overload; {$ifdef HASINLINE}inline;{$endif} /// retrieve the index where to insert a PUtf8Char in a sorted PUtf8Char array // - this overloaded function accept a custom comparison function for sorting // - R is the last index of available entries in P^ (i.e. Count-1) // - string comparison is case-sensitive (so will work with any PAnsiChar) // - returns -1 if the specified Value was found (i.e. adding will duplicate a value) // - will use fast O(log(n)) binary search algorithm function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char; Compare: TUtf8Compare): PtrInt; overload; /// retrieve the index where is located a PUtf8Char in a sorted PUtf8Char array // - R is the last index of available entries in P^ (i.e. Count-1) // - string comparison is case-sensitive StrComp (so will work with any PAnsiChar) // - returns -1 if the specified Value was not found // - will use inlined binary search algorithm with optimized x86_64 branchless asm // - slightly faster than plain FastFindPUtf8CharSorted(P,R,Value,@StrComp) function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt; overload; /// retrieve the index where is located a PUtf8Char in a sorted uppercase array // - P[] array is expected to be already uppercased // - searched Value is converted to uppercase before search via UpperCopy255Buf(), // so is expected to be short, i.e. length < 250 // - R is the last index of available entries in P^ (i.e. Count-1) // - returns -1 if the specified Value was not found // - will use fast O(log(n)) binary search algorithm // - slightly faster than plain FastFindPUtf8CharSorted(P,R,Value,@StrIComp) function FastFindUpperPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char; ValueLen: PtrInt): PtrInt; {$ifdef HASINLINE}inline;{$endif} /// retrieve the index where is located a PUtf8Char in a sorted PUtf8Char array // - R is the last index of available entries in P^ (i.e. Count-1) // - string comparison will use the specified Compare function // - returns -1 if the specified Value was not found // - will use fast O(log(n)) binary search algorithm function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char; Compare: TUtf8Compare): PtrInt; overload; /// retrieve the index of a PUtf8Char in a PUtf8Char array via a sort indexed // - will use fast O(log(n)) binary search algorithm function FastFindIndexedPUtf8Char(P: PPUtf8CharArray; R: PtrInt; var SortedIndexes: TCardinalDynArray; Value: PUtf8Char; ItemComp: TUtf8Compare): PtrInt; /// add a RawUtf8 value in an alphaticaly sorted dynamic array of RawUtf8 // - returns the index where the Value was added successfully in Values[] // - returns -1 if the specified Value was already present in Values[] // (we must avoid any duplicate for O(log(n)) binary search) // - if CoValues is set, its content will be moved to allow inserting a new // value at CoValues[result] position - a typical usage of CoValues is to store // the corresponding ID to each RawUtf8 item // - if FastLocatePUtf8CharSorted() has been already called, this index can // be set to optional ForceIndex parameter // - by default, exact (case-sensitive) match is used; you can specify a custom // compare function if needed in Compare optional parameter function AddSortedRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; const Value: RawUtf8; CoValues: PIntegerDynArray = nil; ForcedIndex: PtrInt = -1; Compare: TUtf8Compare = nil): PtrInt; /// delete a RawUtf8 item in a dynamic array of RawUtf8 // - if CoValues is set, the integer item at the same index is also deleted function DeleteRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; Index: integer; CoValues: PIntegerDynArray = nil): boolean; overload; /// delete a RawUtf8 item in a dynamic array of RawUtf8; function DeleteRawUtf8(var Values: TRawUtf8DynArray; Index: PtrInt): boolean; overload; /// sort a dynamic array of RawUtf8 items // - if CoValues is set, the integer items are also synchronized // - by default, exact (case-sensitive) match is used; you can specify a custom // compare function if needed in Compare optional parameter procedure QuickSortRawUtf8(var Values: TRawUtf8DynArray; ValuesCount: integer; CoValues: PIntegerDynArray = nil; Compare: TUtf8Compare = nil); overload; /// sort a RawUtf8 array, low values first procedure QuickSortRawUtf8(Values: PRawUtf8Array; L, R: PtrInt; caseInsensitive: boolean = false); overload; {$ifdef OSPOSIX} type /// monitor a POSIX folder for all its file names, and allow efficient // case-insensitive search, as it would on a Windows file system // - will use our fast PosixFileNames() low-level API to read the names // and store them into its in-memory cache (until Flush or after FlushSeconds) TPosixFileCaseInsensitive = class protected fSafe: TRWLightLock; fFiles: TRawUtf8DynArray; fFolder: TFileName; fNextTix, fFlushSeconds: integer; fSubFolders: boolean; procedure SetFolder(const aFolder: TFileName); procedure SetSubFolders(aSubFolders: boolean); public /// initialize the file names lookup constructor Create(const aFolder: TFileName; aSubFolders: boolean); reintroduce; /// to be called on a regular pace (e.g. every second) to perform FlushSeconds procedure OnIdle(tix64: Int64); /// clear the internal list to force full reload of the directory procedure Flush; /// case-insensitive search for a given TFileName in the folder // - returns '' if not found, or the exact file name in the POSIX folder // - is thread-safe and non blocking during its lookup // - can optionally return MicroSec spent for actual filenames read on disk function Find(const aSearched: TFileName; aReadMs: PInteger = nil): TFileName; /// how many file entries are currently in the internal list function Count: PtrInt; /// make a dynamic array copy of the internal file names, sorted by StrIComp function Files: TRawUtf8DynArray; /// allow to change the monitored folder at runtime property Folder: TFileName read fFolder write SetFolder; /// define if sub-folders should also be included to the internal list property SubFolders: boolean read fSubFolders write SetSubFolders; /// after how many seconds OnIdle() should flush the internal cache // - default is 60, i.e. 1 minute // - you can set 0 to disable any auto-flush from OnIdle() property FlushSeconds: integer read fFlushSeconds write fFlushSeconds; end; {$endif OSPOSIX} { ************** Operating-System Independent Unicode Process } /// UpperCase conversion of a UTF-8 buffer using our Unicode 10.0 tables // - won't call the Operating System, so is consistent on all platforms, // whereas UpperCaseUnicode() may vary depending on each library implementation // - some codepoints enhance in length, so D^ should be at least twice than S^ // - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd // - won't use temporary UTF-16 decoding, and optimized for plain ASCII content function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char; overload; /// UpperCase conversion of a UTF-8 buffer using our Unicode 10.0 tables // - won't call the Operating System, so is consistent on all platforms, // whereas UpperCaseUnicode() may vary depending on each library implementation // - some codepoints enhance in length, so D^ should be at least twice than S^ // - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd // - won't use temporary UTF-16 decoding, and optimized for plain ASCII content // - knowing the Source length, this function will handle any ASCII 7-bit input // by quad, for efficiency function Utf8UpperReference(S, D: PUtf8Char; SLen: PtrUInt): PUtf8Char; overload; /// UpperCase conversion of a UTF-8 string using our Unicode 10.0 tables // - won't call the Operating System, so is consistent on all platforms, // whereas UpperCaseUnicode() may vary depending on each library implementation // - won't use temporary UTF-16 decoding, and optimized for plain ASCII content function UpperCaseReference(const S: RawUtf8): RawUtf8; /// UTF-8 comparison using our Unicode 10.0 tables // - this version expects u1 and u2 to be zero-terminated // - Utf8IComp() handles WinAnsi CP-1252 latin accents - this one is Unicode // - won't call the Operating System, so is consistent on all platforms, and // don't require any temporary UTF-16 decoding // - has a branchless optimized process of 7-bit ASCII charset [a..z] -> [A..Z] function Utf8ICompReference(u1, u2: PUtf8Char): PtrInt; /// UTF-8 comparison using our Unicode 10.0 tables // - this version expects u1 and u2 not to be necessary zero-terminated, but // uses L1 and L2 as length for u1 and u2 respectively // - Utf8ILComp() handles WinAnsi CP-1252 latin accents - this one is Unicode // - won't call the Operating System, so is consistent on all platforms, and // don't require any temporary UTF-16 decoding // - has a branchless optimized process of 7-bit ASCII charset [a..z] -> [A..Z] function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt; /// UpperCase conversion of UTF-8 into UCS4 using our Unicode 10.0 tables // - won't call the Operating System, so is consistent on all platforms, // whereas UpperCaseUnicode() may vary depending on each library implementation function UpperCaseUcs4Reference(const S: RawUtf8): RawUcs4; /// UTF-8 Unicode 10.0 case-insensitive Pattern search within UTF-8 buffer // - returns nil if no match, or the Pattern position found inside U^ // - Up should have been already converted using UpperCaseUcs4Reference() // - won't call the Operating System, so is consistent on all platforms, and // don't require any temporary UTF-16 decoding function StrPosIReference(U: PUtf8Char; const Up: RawUcs4): PUtf8Char; implementation { *************** UTF-8 Efficient Encoding / Decoding } { TUtf8Table } function TUtf8Table.GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint; var x, i: PtrInt; v: byte; c: PtrUInt; begin result := 0; c := byte(U^); // here c=U^>=#80 inc(U); x := Lookup[c]; if x = UTF8_INVALID then exit; i := 0; repeat v := byte(U[i]); if v and $c0 <> $80 then exit; // invalid input content c := (c shl 6) + v; inc(i); until i = x; inc(U, x); with Extra[x] do begin dec(c, offset); if c < minimum then exit; // invalid input content end; result := c; end; function GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint; begin result := UTF8_TABLE.GetHighUtf8Ucs4(U); end; function GetUtf8WideChar(P: PUtf8Char): cardinal; begin if P <> nil then begin result := byte(P^); if result > $7f then begin result := UTF8_TABLE.GetHighUtf8Ucs4(P); if result > $ffff then // surrogates can't be stored in a single UTF-16 WideChar result := UNICODE_REPLACEMENT_CHARACTER; end; end else result := PtrUInt(P); end; function NextUtf8Ucs4(var P: PUtf8Char): Ucs4CodePoint; begin if P <> nil then begin result := byte(P[0]); if result <= 127 then inc(P) else if result and $20 = 0 then begin // fast $0..$7ff process result := (result shl 6) + byte(P[1]) - $3080; inc(P, 2); end else // complex but efficient wrapper handling even UTF-16 surrogates result := UTF8_TABLE.GetHighUtf8Ucs4(P); end else result := 0; end; function Ucs4ToUtf8(ucs4: Ucs4CodePoint; Dest: PUtf8Char): PtrInt; var j: PtrInt; begin if ucs4 <= $7f then begin Dest^ := AnsiChar(ucs4); result := 1; end else if ucs4 <= $7ff then begin Dest[0] := AnsiChar($C0 or (ucs4 shr 6)); Dest[1] := AnsiChar($80 or (ucs4 and $3F)); result := 2; end else if ucs4 <= $ffff then begin Dest[0] := AnsiChar($E0 or (ucs4 shr 12)); Dest[1] := AnsiChar($80 or ((ucs4 shr 6) and $3F)); Dest[2] := AnsiChar($80 or (ucs4 and $3F)); result := 3; end else begin // here ucs4 > $ffff if ucs4 <= $1FFFFF then result := 4 else if ucs4 <= $3FFFFFF then result := 5 else result := 6; j := result - 1; repeat Dest[j] := AnsiChar((ucs4 and $3f) or $80); ucs4 := ucs4 shr 6; dec(j); until j = 0; Dest^ := AnsiChar(byte(ucs4) or UTF8_TABLE.FirstByte[result]); end; end; function Utf16CharToUtf8(Dest: PUtf8Char; var Source: PWord): integer; var c: cardinal; begin c := Source^; inc(Source); case c of 0..$7f: begin Dest^ := AnsiChar(c); result := 1; exit; end; UTF16_HISURROGATE_MIN..UTF16_HISURROGATE_MAX: begin c := ((c - UTF16_SURROGATE_OFFSET) shl 10) or (Source^ xor UTF16_LOSURROGATE_MIN); inc(Source); end; UTF16_LOSURROGATE_MIN..UTF16_LOSURROGATE_MAX: begin c := ((cardinal(Source^) - UTF16_SURROGATE_OFFSET) shl 10) or (c xor UTF16_LOSURROGATE_MIN); inc(Source); end; end; // now c is the UTF-32/UCS4 code point result := Ucs4ToUtf8(c, Dest); end; function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar; SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt; var c: cardinal; Tail: PWideChar; i, j: integer; label unmatch; begin result := PtrInt(Dest); inc(DestLen, PtrInt(Dest)); if (Source <> nil) and (SourceLen > 0) and (Dest <> nil) then begin // ignore any trailing BOM (do exist on Windows files) if Source^ = BOM_UTF16LE then begin inc(Source); dec(SourceLen); end; // first handle 7-bit ASCII WideChars, by pairs (Sha optimization) SourceLen := SourceLen * 2 + PtrInt(PtrUInt(Source)); Tail := PWideChar(SourceLen) - 2; if (PtrInt(PtrUInt(Dest)) < DestLen) and (Source <= Tail) then repeat c := PCardinal(Source)^; if c and $ff80ff80 <> 0 then break; // break on first non ASCII pair inc(Source, 2); c := c shr 8 or c; PWord(Dest)^ := c; inc(Dest, 2); until (Source > Tail) or (PtrInt(PtrUInt(Dest)) >= DestLen); // generic loop, handling one UCS4 CodePoint per iteration if (PtrInt(PtrUInt(Dest)) < DestLen) and (PtrInt(PtrUInt(Source)) < SourceLen) then repeat // inlined Utf16CharToUtf8() with bufferoverlow check and $FFFD on unmatch c := cardinal(Source^); inc(Source); case c of 0..$7f: begin Dest^ := AnsiChar(c); inc(Dest); if (PtrInt(PtrUInt(Dest)) < DestLen) and (PtrInt(PtrUInt(Source)) < SourceLen) then continue else break; end; UTF16_HISURROGATE_MIN..UTF16_HISURROGATE_MAX: if (PtrInt(PtrUInt(Source)) >= SourceLen) or ((cardinal(Source^) < UTF16_LOSURROGATE_MIN) or (cardinal(Source^) > UTF16_LOSURROGATE_MAX)) then begin unmatch: if (PtrInt(PtrUInt(@Dest[3])) > DestLen) or not (ccfReplacementCharacterForUnmatchedSurrogate in Flags) then break; PWord(Dest)^ := $BFEF; // UTF-8 UNICODE_REPLACEMENT_CHARACTER Dest[2] := AnsiChar($BD); inc(Dest, 3); if (PtrInt(PtrUInt(Dest)) < DestLen) and (PtrInt(PtrUInt(Source)) < SourceLen) then continue else break; end else begin c := ((c - UTF16_SURROGATE_OFFSET) shl 10) or (cardinal(Source^) xor UTF16_LOSURROGATE_MIN); inc(Source); end; UTF16_LOSURROGATE_MIN..UTF16_LOSURROGATE_MAX: if (PtrInt(PtrUInt(Source)) >= SourceLen) or ((cardinal(Source^) < UTF16_HISURROGATE_MIN) or (cardinal(Source^) > UTF16_HISURROGATE_MAX)) then goto unmatch else begin c := ((cardinal(Source^) - UTF16_SURROGATE_OFFSET) shl 10) or (c xor UTF16_LOSURROGATE_MIN); inc(Source); end; end; // now c is the UTF-32/UCS4 code point if c <= $7ff then i := 2 else if c <= $ffff then i := 3 else if c <= $1FFFFF then i := 4 else if c <= $3FFFFFF then i := 5 else i := 6; if PtrInt(PtrUInt(Dest)) + i > DestLen then break; j := i - 1; repeat Dest[j] := AnsiChar((c and $3f) or $80); c := c shr 6; dec(j); until j = 0; Dest^ := AnsiChar(byte(c) or UTF8_TABLE.FirstByte[i]); inc(Dest, i); if (PtrInt(PtrUInt(Dest)) < DestLen) and (PtrInt(PtrUInt(Source)) < SourceLen) then continue else break; until false; if not (ccfNoTrailingZero in Flags) then Dest^ := #0; end; result := PtrInt(PtrUInt(Dest)) - result; end; procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; var result: TSynTempBuffer; Flags: TCharConversionFlags); begin if (WideChar = nil) or (WideCharCount = 0) then result.Init(0) else begin result.Init(WideCharCount * 3); result.Len := RawUnicodeToUtf8( result.buf, result.len, WideChar, WideCharCount, Flags); end; end; procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; var result: RawUtf8; Flags: TCharConversionFlags); var tmp: TSynTempBuffer; begin RawUnicodeToUtf8(WideChar, WideCharCount, tmp, Flags); FastSetString(result, tmp.buf, tmp.len); tmp.Done; end; function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; Flags: TCharConversionFlags): RawUtf8; begin RawUnicodeToUtf8(WideChar, WideCharCount, result, Flags); end; function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer; out Utf8Length: integer): RawUtf8; var LW: integer; begin result := ''; // somewhat faster if result is freed before any SetLength() if WideCharCount = 0 then exit; LW := WideCharCount * 3; // maximum resulting length SetLength(result, LW); Utf8Length := RawUnicodeToUtf8(pointer(result), LW + 1, WideChar, WideCharCount, [ccfNoTrailingZero]); if Utf8Length <= 0 then result := ''; end; procedure Utf8ToShortString(var dest: ShortString; source: PUtf8Char); var c: cardinal; len, extra, i: integer; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} len := 0; if source <> nil then repeat c := byte(source^); inc(source); if c = 0 then break else if c <= 127 then begin inc(len); dest[len] := AnsiChar(c); if len < 253 then continue else break; end else begin extra := utf8.Lookup[c]; if extra = UTF8_INVALID then break; i := extra; repeat if byte(source^) and $c0 <> $80 then begin dest[0] := AnsiChar(len); exit; // invalid UTF-8 content end; c := (c shl 6) + byte(source^); inc(source); dec(i); until i = 0; dec(c, utf8.Extra[extra].offset); // #256.. -> slower but accurate conversion inc(len); if c > $ffff then dest[len] := '?' else dest[len] := AnsiChar(WinAnsiConvert.fWideToAnsi[c]); if len < 253 then continue else break; end; until false; dest[0] := AnsiChar(len); end; function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean): PtrInt; // faster than System.Utf8ToUnicode() var c: cardinal; begd: PWideChar; endSource: PUtf8Char; endDest: PWideChar; i, extra: integer; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label Quit, NoSource; begin result := 0; if dest = nil then exit; if source = nil then goto NoSource; if sourceBytes = 0 then begin if source^ = #0 then goto NoSource; sourceBytes := StrLen(source); end; {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} endSource := source + sourceBytes; endDest := dest + MaxDestChars; begd := dest; repeat c := byte(source^); inc(source); if c <= 127 then begin PWord(dest)^ := c; // much faster than dest^ := WideChar(c) for FPC inc(dest); if (source < endSource) and (dest < endDest) then continue else break; end; extra := utf8.Lookup[c]; if (extra = UTF8_INVALID) or (source + extra > endSource) then break; i := 0; repeat if byte(source^) and $c0 <> $80 then goto Quit; // invalid input content c := (c shl 6) + byte(source[i]); inc(i); until i = extra; inc(source, extra); with utf8.Extra[extra] do begin dec(c, offset); if c < minimum then break; // invalid input content end; if c <= $ffff then begin PWord(dest)^ := c; inc(dest); if (source < endSource) and (dest < endDest) then continue else break; end; dec(c, $10000); // store as UTF-16 surrogates PWordArray(dest)[0] := (c shr 10) or UTF16_HISURROGATE_MIN; PWordArray(dest)[1] := (c and $3FF) or UTF16_LOSURROGATE_MIN; inc(dest, 2); if (source >= endSource) or (dest >= endDest) then break; until false; Quit: result := PtrUInt(dest) - PtrUInt(begd); // dest-begd return byte length NoSource: if not NoTrailingZero then dest^ := #0; // always append a WideChar(0) to the end of the buffer end; function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; sourceBytes: PtrInt; NoTrailingZero: boolean): PtrInt; // faster than System.UTF8Decode() var c: cardinal; begd: PWideChar; endSource, endSourceBy4: PUtf8Char; i, extra: PtrInt; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label quit, nosource, by1, by4; begin result := 0; if dest = nil then exit; if source = nil then goto nosource; if sourceBytes = 0 then begin if source^ = #0 then goto nosource; sourceBytes := StrLen(source); end; {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} begd := dest; endSource := source + sourceBytes; endSourceBy4 := endSource - 4; if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif} (source <= endSourceBy4) then repeat // handle 7-bit ASCII chars, by quad c := PCardinal(source)^; if c and $80808080 <> 0 then goto by1; // break on first non ASCII quad by4: inc(source, 4); PCardinal(dest)^ := (c shl 8 or (c and $FF)) and $00ff00ff; c := c shr 16; PCardinal(dest + 2)^ := (c shl 8 or c) and $00ff00ff; inc(dest, 4); until source > endSourceBy4; if source < endSource then repeat by1: c := byte(source^); inc(source); if c <= 127 then begin PWord(dest)^ := c; // much faster than dest^ := WideChar(c) for FPC inc(dest); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif} (source <= endSourceBy4) then begin c := PCardinal(source)^; if c and $80808080 = 0 then goto by4 else continue; end; if source < endSource then continue else break; end; extra := utf8.Lookup[c]; if (extra = UTF8_INVALID) or (source + extra > endSource) then break; i := 0; repeat if byte(source^) and $c0 <> $80 then goto quit; // invalid input content c := (c shl 6) + byte(source[i]); inc(i); until i = extra; inc(source, extra); with utf8.Extra[extra] do begin dec(c, offset); if c < minimum then break; // invalid input content end; if c <= $ffff then begin PWord(dest)^ := c; inc(dest); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif} (source <= endSourceBy4) then begin c := PCardinal(source)^; if c and $80808080 = 0 then goto by4; continue; end; if source < endSource then continue else break; end; dec(c, $10000); // store as UTF-16 surrogates PWordArray(dest)[0] := (c shr 10) or UTF16_HISURROGATE_MIN; PWordArray(dest)[1] := (c and $3FF) or UTF16_LOSURROGATE_MIN; inc(dest, 2); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif} (source <= endSourceBy4) then begin c := PCardinal(source)^; if c and $80808080 = 0 then goto by4; continue; end; if source >= endSource then break; until false; quit: result := PtrUInt(dest) - PtrUInt(begd); // dest-begd returns bytes length nosource: if not NoTrailingZero then dest^ := #0; // always append a WideChar(0) to the end of the buffer end; function IsValidUtf8Pas(source: PUtf8Char; sourcelen: PtrInt): boolean; var c: byte; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label done; begin inc(PtrUInt(sourcelen), PtrUInt(source) - 4); if source = nil then goto done; {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} repeat if PtrUInt(source) <= PtrUInt(sourcelen) then begin if utf8.Lookup[ord(source[0])] = UTF8_ASCII then if utf8.Lookup[ord(source[1])] = UTF8_ASCII then if utf8.Lookup[ord(source[2])] = UTF8_ASCII then if utf8.Lookup[ord(source[3])] = UTF8_ASCII then begin inc(source, 4); // optimized for JSON-like content continue; end else inc(source, 3) else inc(source, 2) else inc(source); end else if PtrUInt(source) >= PtrUInt(sourcelen) + 4 then break; c := utf8.Lookup[ord(source^)]; inc(source); if c = UTF8_ASCII then continue else if c >= UTF8_INVALID then // UTF8_INVALID=6, UTF8_ZERO=7 means unexpected end of input break; // c = extras -> check valid UTF-8 content repeat if byte(source^) and $c0 <> $80 then goto done; inc(source); dec(c); until c = 0; until false; done: result := PtrUInt(source) = PtrUInt(sourcelen) + 4; end; function IsValidUtf8(source: PUtf8Char): boolean; begin result := IsValidUtf8Buffer(source, StrLen(source)); end; function IsValidUtf8(const source: RawUtf8): boolean; begin result := IsValidUtf8Buffer(pointer(source), length(source)); end; procedure DetectRawUtf8(var source: RawByteString); begin {$ifdef HASCODEPAGE} // do nothing on oldest Delphi if (source <> '') and IsValidUtf8(source) then EnsureRawUtf8(source); {$endif HASCODEPAGE} end; function IsValidUtf8WithoutControlChars(source: PUtf8Char): boolean; var c: byte; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} result := false; if source <> nil then repeat c := byte(source^); inc(source); if c <= $7f then if c < 32 then if c = 0 then break // reached end of input else exit // disallow #1..#31 control char else continue; c := utf8.Lookup[c]; if c = UTF8_INVALID then exit; // check valid UTF-8 content repeat if byte(source^) and $c0 <> $80 then exit; inc(source); dec(c); until c = 0; until false; result := true; end; function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean; var s, len: PtrInt; c: byte; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} result := false; s := 1; len := Length(source); while s <= len do begin c := byte(source[s]); inc(s); if c < 32 then exit // disallow #0..#31 control char else if c > $7f then begin c := utf8.Lookup[c]; if c = UTF8_INVALID then exit; // check valid UTF-8 content repeat if byte(source[s]) and $c0 <> $80 then exit; inc(s); dec(c); until c = 0; end; end; result := true; end; function Utf8ToUnicodeLength(source: PUtf8Char): PtrUInt; var c: byte; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} result := 0; if source <> nil then repeat c := utf8.Lookup[byte(source^)]; inc(source); if c = UTF8_ASCII then inc(result) else if c = UTF8_ZERO then break else if c = UTF8_INVALID then exit else begin inc(result, 1 + ord(c >= UTF8_EXTRA_SURROGATE)); // check valid UTF-8 content repeat if byte(source^) and $c0 <> $80 then exit; inc(source); dec(c); until c = 0; end; until false; end; function Utf8TruncateToUnicodeLength(var text: RawUtf8; maxUtf16: integer): boolean; var c: byte; source: PUtf8Char; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label trunc; begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} source := pointer(text); if (source <> nil) and (cardinal(maxUtf16) < cardinal(Length(text))) then repeat c := utf8.Lookup[byte(source^)]; inc(source); if c = UTF8_ASCII then begin dec(maxUtf16); if maxUtf16 <> 0 then continue; trunc: SetLength(text, source - pointer(text)); result := true; exit; end else if (c = UTF8_ZERO) or (c = UTF8_INVALID) then break else begin dec(maxUtf16, 1 + ord(c >= UTF8_EXTRA_SURROGATE)); if maxUtf16 < 0 then goto trunc; // not enough place for this UTF-8 codepoint // check valid UTF-8 content repeat if byte(source^) and $c0 <> $80 then break; inc(source); dec(c); until c = 0; if maxUtf16 = 0 then goto trunc; end; until false; result := false; end; function Utf8TruncateToLength(var text: RawUtf8; maxBytes: PtrUInt): boolean; begin if PtrUInt(Length(text)) < maxBytes then begin result := false; exit; // nothing to truncate end; while (maxBytes > 0) and (ord(text[maxBytes]) and $c0 = $80) do dec(maxBytes); if (maxBytes > 0) and (text[maxBytes] > #$7f) then dec(maxBytes); SetLength(text, maxBytes); result := true; end; function Utf8TruncatedLength(const text: RawUtf8; maxBytes: PtrUInt): PtrInt; begin result := Length(text); if PtrUInt(result) < maxBytes then exit; result := maxBytes; if (result = 0) or (text[result] <= #$7f) then exit; while (result > 0) and (ord(text[result]) and $c0 = $80) do dec(result); if (result > 0) and (text[result] > #$7f) then dec(result); end; function Utf8TruncatedLength(text: PAnsiChar; textlen, maxBytes: PtrUInt): PtrInt; begin result := textlen; if textlen < maxBytes then exit; result := maxBytes; while (result > 0) and (ord(text[result]) and $c0 = $80) do dec(result); if (result > 0) and (text[result] > #$7f) then dec(result); end; function Utf8FirstLineToUtf16Length(source: PUtf8Char): PtrInt; var c: PtrUInt; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} result := 0; if source <> nil then repeat c := byte(source^); inc(source); if c <= $7f then if byte(c) in [0, 10, 13] then break // #0, #10 or #13 stop the count else inc(result) else begin c := utf8.Lookup[c]; if c = UTF8_INVALID then exit; // invalid leading byte inc(result, 1 + ord(c >= UTF8_EXTRA_SURROGATE)); inc(source, c); // a bit less safe, but faster end; until false; end; { **************** UTF-8 / Unicode / Ansi Conversion Classes } var // internal list of TSynAnsiConvert instances SynAnsiConvertList: array of TSynAnsiConvert; SynAnsiConvertListLock: TRWLightLock; SynAnsiConvertListCount: integer; SynAnsiConvertListCodePage: TWordDynArray; // for fast lookup in CPU L1 cache { TSynAnsiConvert } function TSynAnsiConvert.AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar; var c: cardinal; begin // first handle trailing 7-bit ASCII chars, by quad (Sha optimization) if SourceChars >= 4 then repeat c := PCardinal(Source)^; if c and $80808080 <> 0 then break; // break on first non ASCII quad dec(SourceChars, 4); inc(Source, 4); PCardinal(Dest)^ := (c shl 8 or (c and $FF)) and $00ff00ff; c := c shr 16; PCardinal(Dest + 2)^ := (c shl 8 or c) and $00ff00ff; inc(Dest, 4); until SourceChars < 4; if (SourceChars > 0) and (ord(Source^) < 128) then repeat dec(SourceChars); PWord(Dest)^ := ord(Source^); // faster than dest^ := WideChar(c) on FPC inc(Source); inc(Dest); until (SourceChars = 0) or (ord(Source^) >= 128); if SourceChars > 0 then // rely on the Operating System for all remaining ASCII characters inc(Dest, Unicode_AnsiToWide(Source, Dest, SourceChars, SourceChars, fCodePage)); if not NoTrailingZero then Dest^ := #0; result := Dest; end; function TSynAnsiConvert.AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char; var tmp: TSynTempBuffer; c: cardinal; U: PWideChar; begin // first handle trailing 7-bit ASCII chars, by quad (Sha optimization) if SourceChars >= 4 then repeat c := PCardinal(Source)^; if c and $80808080 <> 0 then break; // break on first non ASCII quad PCardinal(Dest)^ := c; dec(SourceChars, 4); inc(Source, 4); inc(Dest, 4); until SourceChars < 4; if (SourceChars > 0) and (ord(Source^) < 128) then repeat Dest^ := Source^; dec(SourceChars); inc(Source); inc(Dest); until (SourceChars = 0) or (ord(Source^) >= 128); // rely on the Operating System for all remaining ASCII characters if SourceChars = 0 then result := Dest else begin U := AnsiBufferToUnicode(tmp.Init(SourceChars * 3), Source, SourceChars); result := Dest + RawUnicodeToUtf8(Dest, SourceChars * 3, tmp.buf, (PtrUInt(U) - PtrUInt(tmp.buf)) shr 1, [ccfNoTrailingZero]); tmp.Done; end; if not NoTrailingZero then result^ := #0; end; // UTF-8 is AT MOST 50% bigger than UTF-16 in bytes in range U+0800..U+FFFF // see http://stackoverflow.com/a/7008095 -> bytes=WideCharCount*3 below {$ifndef PUREMORMOT2} function TSynAnsiConvert.AnsiToRawUnicode(const AnsiText: RawByteString): RawUnicode; begin result := AnsiToRawUnicode(pointer(AnsiText), length(AnsiText)); end; function TSynAnsiConvert.AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; var U: PWideChar; tmp: TSynTempBuffer; begin if SourceChars = 0 then result := '' else begin U := AnsiBufferToUnicode(tmp.Init(SourceChars * 2), Source, SourceChars); U^ := #0; SetString(result, PAnsiChar(tmp.buf), PtrUInt(U) - PtrUInt(tmp.buf) + 1); tmp.Done; end; end; {$endif PUREMORMOT2} function TSynAnsiConvert.AnsiToUnicodeString(Source: PAnsiChar; SourceChars: cardinal): SynUnicode; var tmp: TSynTempBuffer; U: PWideChar; begin if SourceChars = 0 then result := '' else begin U := AnsiBufferToUnicode(tmp.Init(SourceChars * 2), Source, SourceChars); SetString(result, PWideChar(tmp.buf), (PtrUInt(U) - PtrUInt(tmp.buf)) shr 1); tmp.Done; end; end; function TSynAnsiConvert.AnsiToUnicodeString(const Source: RawByteString): SynUnicode; var tmp: TSynTempBuffer; U: PWideChar; begin if Source = '' then result := '' else begin tmp.Init(length(Source) * 2); // max dest size in bytes U := AnsiBufferToUnicode(tmp.buf, pointer(Source), length(Source)); SetString(result, PWideChar(tmp.buf), (PtrUInt(U) - PtrUInt(tmp.buf)) shr 1); tmp.Done; end; end; function TSynAnsiConvert.AnsiToUtf8(const AnsiText: RawByteString): RawUtf8; begin AnsiBufferToRawUtf8(pointer(AnsiText), length(AnsiText), result); end; procedure TSynAnsiConvert.AnsiBufferToRawUtf8(Source: PAnsiChar; SourceChars: cardinal; out Value: RawUtf8); var tmp: TSynTempBuffer; P: PUtf8Char; begin if (Source = nil) or (SourceChars = 0) then exit; P := AnsiBufferToUtf8(tmp.Init(SourceChars * 3), Source, SourceChars); FastSetString(Value, tmp.buf, P - tmp.buf); tmp.Done; end; constructor TSynAnsiConvert.Create(aCodePage: cardinal); begin fCodePage := aCodePage; fAnsiCharShift := 1; // default is safe end; function GetEngine(aCodePage: cardinal): TSynAnsiConvert; {$ifdef HASINLINE} inline; {$endif} var i: PtrInt; begin SynAnsiConvertListLock.ReadLock; // concurrent read lock i := WordScanIndex(pointer(SynAnsiConvertListCodePage), SynAnsiConvertListCount, aCodePage); // SSE2 asm on i386 and x86_64 if i >= 0 then result := SynAnsiConvertList[i] else result := nil; SynAnsiConvertListLock.ReadUnLock; end; function NewEngine(aCodePage: cardinal): TSynAnsiConvert; var i: PtrInt; begin SynAnsiConvertListLock.WriteLock; try i := WordScanIndex(pointer(SynAnsiConvertListCodePage), SynAnsiConvertListCount, aCodePage); // search again for thread safety if i >= 0 then begin result := SynAnsiConvertList[i]; // avoid any (unlikely) race condition exit; end; if aCodePage = CP_UTF8 then result := TSynAnsiUtf8.Create(CP_UTF8) else if aCodePage = CP_UTF16 then result := TSynAnsiUtf16.Create(CP_UTF16) else if IsFixedWidthCodePage(aCodePage) then result := TSynAnsiFixedWidth.Create(aCodePage) else result := TSynAnsiConvert.Create(aCodePage); RegisterGlobalShutdownRelease(result); ObjArrayAdd(SynAnsiConvertList, result); AddWord(SynAnsiConvertListCodePage, SynAnsiConvertListCount, aCodePage); finally SynAnsiConvertListLock.WriteUnLock; end; end; class function TSynAnsiConvert.Engine(aCodePage: cardinal): TSynAnsiConvert; begin if aCodePage <> CP_ACP then begin result := GetEngine(aCodePage); if result = nil then if aCodePage = CP_RAWBLOB then result := RawByteStringConvert // CP_RAWBLOB is internal -> no engine else result := NewEngine(aCodePage) end else result := CurrentAnsiConvert; end; function TSynAnsiConvert.UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; var c: cardinal; begin if (Source <> nil) and (SourceChars <> 0) then begin // ignore any trailing BOM (do exist on Windows files) if Source^ = BOM_UTF16LE then begin inc(Source); dec(SourceChars); end; // first handle trailing 7-bit ASCII chars, by pairs (Sha optimization) if SourceChars >= 2 then repeat c := PCardinal(Source)^; if c and $ff80ff80 <> 0 then break; // break on first non ASCII pair dec(SourceChars, 2); inc(Source, 2); c := c shr 8 or c; PWord(Dest)^ := c; inc(Dest, 2); until SourceChars < 2; if (SourceChars > 0) and (ord(Source^) < 128) then repeat Dest^ := AnsiChar(ord(Source^)); dec(SourceChars); inc(Source); inc(Dest); until (SourceChars = 0) or (ord(Source^) >= 128); // rely on the Operating System for all remaining ASCII characters if SourceChars <> 0 then inc(Dest, Unicode_WideToAnsi(Source, Dest, SourceChars, SourceChars * 3, fCodePage)); end; result := Dest; end; function TSynAnsiConvert.Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; var tmp: TSynTempBuffer; begin if (Source = nil) or (SourceChars = 0) then result := Dest else begin tmp.Init((SourceChars + 1) shl fAnsiCharShift); result := UnicodeBufferToAnsi(Dest, tmp.buf, Utf8ToWideChar(tmp.buf, Source, SourceChars) shr 1); tmp.Done; end; end; function TSynAnsiConvert.Utf8BufferToAnsi( Source: PUtf8Char; SourceChars: cardinal): RawByteString; begin Utf8BufferToAnsi(Source, SourceChars, result); end; procedure TSynAnsiConvert.Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal; var result: RawByteString); var tmp: array[word] of AnsiChar; max: PtrInt; begin if (Source = nil) or (SourceChars = 0) then result := '' else begin max := (SourceChars + 1) shl fAnsiCharShift; if max < SizeOf(tmp) then // use a temporary stack buffer up to 64KB FastSetStringCP(result, @tmp, Utf8BufferToAnsi(@tmp, Source, SourceChars) - PAnsiChar(@tmp), fCodePage) else begin // huge strings will be allocated once and truncated, not resized FastSetStringCP(result, nil, max, fCodePage); FakeLength(result, Utf8BufferToAnsi(pointer(result), Source, SourceChars) - pointer(result)); end; end; end; function TSynAnsiConvert.Utf8ToAnsi(const u: RawUtf8): RawByteString; begin Utf8BufferToAnsi(pointer(u), length(u), result); end; function TSynAnsiConvert.Utf8ToAnsiBuffer2K(const S: RawUtf8; Dest: PAnsiChar; DestSize: integer): integer; var tmp: array[0..2047] of AnsiChar; // truncated to 2KB as documented begin if (DestSize <= 0) or (Dest = nil) then begin result := 0; exit; end; result := length(S); if result > 0 then begin if result > SizeOf(tmp) then result := SizeOf(tmp); result := Utf8BufferToAnsi(tmp{%H-}, pointer(S), result) - {%H-}tmp; if result >= DestSize then result := DestSize - 1; MoveFast(tmp, Dest^, result); end; Dest[result] := #0; end; function TSynAnsiConvert.UnicodeBufferToAnsi(Source: PWideChar; SourceChars: cardinal): RawByteString; var tmp: TSynTempBuffer; begin if (Source = nil) or (SourceChars = 0) then result := '' else begin tmp.Init(SourceChars * 3); FastSetStringCP(result, tmp.buf, UnicodeBufferToAnsi( tmp.buf, Source, SourceChars) - PAnsiChar(tmp.buf), fCodePage); tmp.Done; end; end; function TSynAnsiConvert.UnicodeStringToAnsi(const Source: SynUnicode): RawByteString; begin result := UnicodeBufferToAnsi(pointer(Source), length(Source)); end; {$ifndef PUREMORMOT2} function TSynAnsiConvert.RawUnicodeToAnsi(const Source: RawUnicode): RawByteString; begin result := UnicodeBufferToAnsi(pointer(Source), length(Source) shr 1); end; {$endif PUREMORMOT2} function TSynAnsiConvert.AnsiToAnsi(From: TSynAnsiConvert; const Source: RawByteString): RawByteString; begin if From = self then result := Source else result := AnsiToAnsi(From, pointer(Source), length(Source)); end; function TSynAnsiConvert.AnsiToAnsi(From: TSynAnsiConvert; Source: PAnsiChar; SourceChars: cardinal): RawByteString; var tmp: TSynTempBuffer; U: PWideChar; begin if From.fCodePage = fCodePage then FastSetStringCP(result, Source, SourceChars, fCodePage) else if (Source = nil) or (SourceChars = 0) then result := '' else begin U := tmp.Init(SourceChars * 2 + 2); result := UnicodeBufferToAnsi(U, From.AnsiBufferToUnicode(U, Source, SourceChars) - U); tmp.Done; end; end; { TSynAnsiFixedWidth } function TSynAnsiFixedWidth.AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar; var i: integer; tab: PWordArray; begin // PWord*(Dest)[] is much faster than dest^ := WideChar(c) for FPC tab := pointer(fAnsiToWide); for i := 1 to SourceChars shr 2 do begin PWordArray(Dest)[0] := tab[Ord(Source[0])]; PWordArray(Dest)[1] := tab[Ord(Source[1])]; PWordArray(Dest)[2] := tab[Ord(Source[2])]; PWordArray(Dest)[3] := tab[Ord(Source[3])]; inc(Source, 4); inc(Dest, 4); end; for i := 1 to SourceChars and 3 do begin PWord(Dest)^ := tab[Ord(Source^)]; inc(Dest); inc(Source); end; if not NoTrailingZero then Dest^ := #0; result := Dest; end; function TSynAnsiFixedWidth.AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char; var EndSource, EndSourceBy4: PAnsiChar; c: cardinal; label by4, by1; // ugly but faster begin if (self = nil) or (Dest = nil) then begin result := nil; Exit; end else if (Source <> nil) and (SourceChars > 0) then begin // handle 7-bit ASCII WideChars, by quads EndSource := Source + SourceChars; EndSourceBy4 := EndSource - 4; if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= EndSourceBy4) then repeat c := PCardinal(Source)^; if c and $80808080 <> 0 then goto by1; // break on first non ASCII quad by4: inc(Source, 4); PCardinal(Dest)^ := c; inc(Dest, 4); until Source > EndSourceBy4; // generic loop, handling one WideChar per iteration if Source < EndSource then repeat by1: c := byte(Source^); inc(Source); if c <= $7F then begin Dest^ := AnsiChar(c); // 0..127 don't need any translation Inc(Dest); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= EndSourceBy4) then begin c := PCardinal(Source)^; if c and $80808080 = 0 then goto by4; continue; end; if Source < EndSource then continue else break; end else begin // no surrogate is expected in TSynAnsiFixedWidth charsets c := fAnsiToWide[c]; // convert FixedAnsi char into Unicode char if c > $7ff then begin Dest[0] := AnsiChar($E0 or (c shr 12)); Dest[1] := AnsiChar($80 or ((c shr 6) and $3F)); Dest[2] := AnsiChar($80 or (c and $3F)); Inc(Dest, 3); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= EndSourceBy4) then begin c := PCardinal(Source)^; if c and $80808080 = 0 then goto by4; continue; end; if Source < EndSource then continue else break; end else begin Dest[0] := AnsiChar($C0 or (c shr 6)); Dest[1] := AnsiChar($80 or (c and $3F)); Inc(Dest, 2); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source < EndSourceBy4) then begin c := PCardinal(Source)^; if c and $80808080 = 0 then goto by4; continue; end; if Source < EndSource then continue else break; end; end; until false; end; if not NoTrailingZero then Dest^ := #0; {$ifdef ISDELPHI104} exit(Dest); // circumvent Delphi 10.4 optimizer bug {$else} result := Dest; {$endif ISDELPHI104} end; {$ifndef PUREMORMOT2} function TSynAnsiFixedWidth.AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; begin if SourceChars = 0 then result := '' else begin SetString(result, nil, SourceChars * 2 + 1); AnsiBufferToUnicode(pointer(result), Source, SourceChars); end; end; {$endif PUREMORMOT2} const /// reference set for WinAnsi to Unicode conversion // - this table contains all the Unicode codepoints corresponding to // the Ansi Code Page 1252 (i.e. WinAnsi), which Unicode value are > 255 // - values taken from MultiByteToWideChar(1252,0,@Tmp,256,@WinAnsiTable,256) // so are available outside the Windows platforms (e.g. Linux/BSD) and even // if the system has been tweaked as such: // http://www.fas.harvard.edu/~chgis/data/chgis/downloads/v4/howto/cyrillic.html WinAnsiUnicodeChars: packed array[128..159] of word = ( 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376); constructor TSynAnsiFixedWidth.Create(aCodePage: cardinal); var i, len: PtrInt; A256: array[0..255] of AnsiChar; U256: array[0..255] of WideChar; begin inherited; if not IsFixedWidthCodePage(aCodePage) then // warning: CreateUtf8() uses Utf8ToString() -> call CreateFmt() here raise ESynUnicode.CreateFmt('%s.Create - Invalid code page %d', [ClassNameShort(self)^, fCodePage]); // create internal look-up tables SetLength(fAnsiToWide, 256); if (aCodePage = CP_WINANSI) or (aCodePage = CP_LATIN1) or (aCodePage >= CP_RAWBLOB) then begin // Win1252 has its own table, LATIN1 and RawByteString map 8-bit Unicode for i := 0 to 255 do fAnsiToWide[i] := i; if aCodePage = CP_WINANSI then // do not trust the Windows API for the 1252 code page :( for i := low(WinAnsiUnicodeChars) to high(WinAnsiUnicodeChars) do fAnsiToWide[i] := WinAnsiUnicodeChars[i]; end else begin // initialize table from Operating System returned values for i := 0 to 255 do A256[i] := AnsiChar(i); FillcharFast(U256, SizeOf(U256), 0); // call mormot.core.os cross-platform Unicode_AnsiToWide() len := PtrUInt(inherited AnsiBufferToUnicode(U256, A256, 256)) - PtrUInt(@U256); if (len < 500) or (len > 512) then // warning: CreateUtf8() uses Utf8ToString() -> call CreateFmt() now raise ESynUnicode.CreateFmt('OS error for %s.Create(%d) [%d]', [ClassNameShort(self)^, aCodePage, len]); MoveFast(U256[0], fAnsiToWide[0], 512); end; SetLength(fWideToAnsi, 65536); for i := 1 to 126 do fWideToAnsi[i] := i; FillcharFast(fWideToAnsi[127], 65536 - 127, ord('?')); // '?' for unknown char for i := 127 to 255 do if (fAnsiToWide[i] <> 0) and (fAnsiToWide[i] <> ord('?')) then fWideToAnsi[fAnsiToWide[i]] := i; // fixed width Ansi will never be bigger than UTF-8 fAnsiCharShift := 0; end; function TSynAnsiFixedWidth.IsValidAnsi(WideText: PWideChar; Length: PtrInt): boolean; var i: PtrInt; wc: PtrUInt; begin result := false; if WideText <> nil then for i := 0 to Length - 1 do begin wc := PtrUInt(WideText[i]); if wc = 0 then break else if wc < 256 then if fAnsiToWide[wc] < 256 then continue else exit else if fWideToAnsi[wc] = ord('?') then exit else continue; end; result := true; end; function TSynAnsiFixedWidth.IsValidAnsi(WideText: PWideChar): boolean; var wc: PtrUInt; begin result := false; if WideText <> nil then repeat wc := PtrUInt(WideText^); inc(WideText); if wc = 0 then break else if wc < 256 then if fAnsiToWide[wc] < 256 then continue else exit else if fWideToAnsi[wc] = ord('?') then exit else continue; until false; result := true; end; function TSynAnsiFixedWidth.IsValidAnsiU(Utf8Text: PUtf8Char): boolean; var extra: byte; c, n: cardinal; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} result := false; if Utf8Text <> nil then repeat extra := utf8.Lookup[ord(Utf8Text^)]; inc(Utf8Text); if extra = UTF8_ASCII then continue else if extra = UTF8_ZERO then break else if extra = UTF8_INVALID then exit else begin if utf8.Extra[extra].minimum > $ffff then exit; n := extra; c := ord(Utf8Text[-1]); repeat if byte(Utf8Text^) and $c0 <> $80 then exit; // invalid UTF-8 content c := (c shl 6) + byte(Utf8Text^); inc(Utf8Text); dec(n) until n = 0; dec(c, utf8.Extra[extra].offset); if (c > $ffff) or (fWideToAnsi[c] = ord('?')) then exit; // invalid char in the WinAnsi code page end; until false; result := true; end; function TSynAnsiFixedWidth.IsValidAnsiU8Bit(Utf8Text: PUtf8Char): boolean; var extra: byte; c, n: cardinal; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} result := false; if Utf8Text <> nil then repeat extra := utf8.Lookup[ord(Utf8Text^)]; inc(Utf8Text); if extra = UTF8_ASCII then continue else if extra = UTF8_ZERO then break else if extra = UTF8_INVALID then exit else begin if utf8.Extra[extra].minimum > $ffff then exit; n := extra; c := ord(Utf8Text[-1]); repeat if byte(Utf8Text^) and $c0 <> $80 then exit; // invalid UTF-8 content c := (c shl 6) + byte(Utf8Text^); inc(Utf8Text); dec(n) until n = 0; dec(c, utf8.Extra[extra].offset); if (c > 255) or (fAnsiToWide[c] > 255) then exit; // not 8-bit char (like "tm" or such) is marked invalid end; until false; result := true; end; function TSynAnsiFixedWidth.UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; var c: cardinal; tab: PAnsiChar; begin if (Source <> nil) and (SourceChars <> 0) then begin // ignore any trailing BOM (do exist on Windows files) if Source^ = BOM_UTF16LE then begin inc(Source); dec(SourceChars); end; // first handle trailing 7-bit ASCII chars, by pairs (Sha optimization) if SourceChars >= 2 then repeat c := PCardinal(Source)^; if c and $ff80ff80 <> 0 then break; // break on first non ASCII pair dec(SourceChars, 2); inc(Source, 2); c := c shr 8 or c; PWord(Dest)^ := c; inc(Dest, 2); until SourceChars < 2; // use internal lookup tables for fast process of remaining chars tab := pointer(fWideToAnsi); for c := 1 to SourceChars shr 2 do begin Dest[0] := tab[Ord(Source[0])]; Dest[1] := tab[Ord(Source[1])]; Dest[2] := tab[Ord(Source[2])]; Dest[3] := tab[Ord(Source[3])]; inc(Source, 4); inc(Dest, 4); end; for c := 1 to SourceChars and 3 do begin Dest^ := tab[Ord(Source^)]; inc(Dest); inc(Source); end; end; result := Dest; end; function TSynAnsiFixedWidth.Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; var c: cardinal; endSource, endSourceBy4: PUtf8Char; i, extra: PtrInt; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label by1, by4, quit; // ugly but faster begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} // first handle trailing 7-bit ASCII chars, by quad (Sha optimization) endSource := Source + SourceChars; endSourceBy4 := endSource - 4; if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= endSourceBy4) then repeat c := PCardinal(Source)^; if c and $80808080 <> 0 then goto by1; // break on first non ASCII quad by4: PCardinal(Dest)^ := c; inc(Source, 4); inc(Dest, 4); until Source > endSourceBy4; // generic loop, handling one UTF-8 code per iteration if Source < endSource then begin repeat by1: c := byte(Source^); inc(Source); if ord(c) <= 127 then begin Dest^ := AnsiChar(c); inc(Dest); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= endSourceBy4) then begin c := PCardinal(Source)^; if c and $80808080 = 0 then goto by4; continue; end; if Source < endSource then continue else break; end else begin extra := utf8.Lookup[c]; if (extra = UTF8_INVALID) or (Source + extra > endSource) then break; i := extra; repeat if byte(Source^) and $c0 <> $80 then goto quit; // invalid UTF-8 content c := (c shl 6) + byte(Source^); inc(Source); dec(i); until i = 0; dec(c, utf8.Extra[extra].offset); if c > $ffff then Dest^ := '?' // '?' as in unknown fWideToAnsi[] items else Dest^ := AnsiChar(fWideToAnsi[c]); inc(Dest); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= endSourceBy4) then begin c := PCardinal(Source)^; if c and $80808080 = 0 then goto by4; continue; end; if Source < endSource then continue else break; end; until false; end; quit: result := Dest; end; function TSynAnsiFixedWidth.WideCharToAnsiChar(wc: cardinal): integer; begin if wc < 256 then if fAnsiToWide[wc] < 256 then result := wc else result := -1 else if wc <= 65535 then begin result := fWideToAnsi[wc]; if result = ord('?') then result := -1; end else result := -1; end; { TSynAnsiUtf8 } function TSynAnsiUtf8.AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar; begin result := Dest + (Utf8ToWideChar(Dest, PUtf8Char(Source), SourceChars, NoTrailingZero) shr 1); end; function TSynAnsiUtf8.AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char; begin MoveFast(Source^, Dest^, SourceChars); if not NoTrailingZero then Dest[SourceChars] := #0; result := Dest + SourceChars; end; {$ifndef PUREMORMOT2} function TSynAnsiUtf8.AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; begin result := Utf8DecodeToRawUniCode(PUtf8Char(Source), SourceChars); end; {$endif PUREMORMOT2} constructor TSynAnsiUtf8.Create(aCodePage: cardinal); begin if aCodePage <> CP_UTF8 then raise ESynUnicode.CreateFmt('%s.Create(%d)', [ClassNameShort(self)^, aCodePage]); inherited Create(aCodePage); end; function TSynAnsiUtf8.UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; begin result := Dest + RawUnicodeToUtf8(PUtf8Char(Dest), SourceChars * 3, Source, SourceChars, [ccfNoTrailingZero]); end; function TSynAnsiUtf8.UnicodeBufferToAnsi(Source: PWideChar; SourceChars: cardinal): RawByteString; var tmp: TSynTempBuffer; begin if (Source = nil) or (SourceChars = 0) then result := '' else begin tmp.Init(SourceChars * 3); FastSetStringCP(result, tmp.buf, RawUnicodeToUtf8(tmp.buf, SourceChars * 3, Source, SourceChars, [ccfNoTrailingZero]), fCodePage); tmp.Done; end; end; function TSynAnsiUtf8.Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; begin MoveFast(Source^, Dest^, SourceChars); result := Dest + SourceChars; end; procedure TSynAnsiUtf8.Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal; var result: RawByteString); begin FastSetString(RawUtf8(result), Source, SourceChars); end; function TSynAnsiUtf8.Utf8ToAnsi(const u: RawUtf8): RawByteString; begin result := u; // may be read-only: no FastAssignUtf8/FakeCodePage EnsureRawUtf8(result); end; function TSynAnsiUtf8.AnsiToUtf8(const AnsiText: RawByteString): RawUtf8; begin result := AnsiText; // may be read-only: no FastAssignUtf8/FakeCodePage EnsureRawUtf8(result); end; procedure TSynAnsiUtf8.AnsiBufferToRawUtf8( Source: PAnsiChar; SourceChars: cardinal; out Value: RawUtf8); begin FastSetString(Value, Source, SourceChars); end; { TSynAnsiUtf16 } function TSynAnsiUtf16.AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar; begin MoveFast(Source^, Dest^, SourceChars); result := Pointer(PtrUInt(Dest) + SourceChars); if not NoTrailingZero then result^ := #0; end; const NOTRAILING: array[boolean] of TCharConversionFlags = ( [], [ccfNoTrailingZero]); function TSynAnsiUtf16.AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char; begin SourceChars := SourceChars shr 1; // from byte count to WideChar count result := Dest + RawUnicodeToUtf8(Dest, SourceChars * 3, PWideChar(Source), SourceChars, NOTRAILING[NoTrailingZero]); end; {$ifndef PUREMORMOT2} function TSynAnsiUtf16.AnsiToRawUnicode(Source: PAnsiChar; SourceChars: cardinal): RawUnicode; begin SetString(result, Source, SourceChars); // byte count end; {$endif PUREMORMOT2} constructor TSynAnsiUtf16.Create(aCodePage: cardinal); begin if aCodePage <> CP_UTF16 then raise ESynUnicode.CreateFmt('%s.Create(%d)', [ClassNameShort(self)^, aCodePage]); inherited Create(aCodePage); end; function TSynAnsiUtf16.UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar; SourceChars: cardinal): PAnsiChar; begin SourceChars := SourceChars shl 1; // from WideChar count to byte count MoveFast(Source^, Dest^, SourceChars); result := Dest + SourceChars; end; function TSynAnsiUtf16.Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char; SourceChars: cardinal): PAnsiChar; begin result := Dest + Utf8ToWideChar(PWideChar(Dest), Source, SourceChars, true); end; function IsFixedWidthCodePage(aCodePage: cardinal): boolean; begin result := ((aCodePage >= 1250) and (aCodePage <= 1258)) or (aCodePage = CP_LATIN1) or (aCodePage >= CP_RAWBLOB); end; function CodePageToText(aCodePage: cardinal): TShort16; begin case aCodePage of CP_UTF8: result := 'utf8'; CODEPAGE_US: result := 'WinAnsi'; else begin PCardinal(@result)^ := 2 + ord('c') shl 8 + ord('p') shl 16; AppendShortCardinal(aCodePage, result); end; end; end; { *************** Text File Loading with BOM/Unicode Support } function BomFile(var Buffer: pointer; var BufferSize: PtrInt): TBomFile; begin result := bomNone; if (Buffer <> nil) and (BufferSize >= 2) then case PWord(Buffer)^ of ord(BOM_UTF16LE): begin inc(PByte(Buffer), 2); dec(BufferSize, 2); result := bomUnicode; // UTF-16 LE end; $BBEF: if (BufferSize >= 3) and (PByteArray(Buffer)[2] = $BF) then begin inc(PByte(Buffer), 3); dec(BufferSize, 3); result := bomUtf8; end; end; end; function StringFromBomFile(const FileName: TFileName; out FileContent: RawByteString; out Buffer: pointer; out BufferSize: PtrInt): TBomFile; begin FileContent := StringFromFile(FileName); Buffer := pointer(FileContent); BufferSize := length(FileContent); result := BomFile(Buffer, BufferSize); end; function RawUtf8FromFile(const FileName: TFileName): RawUtf8; begin result := AnyTextFileToRawUtf8(FileName, {AssumeUtf8IfNoBom=}true); end; function AnyTextFileToRawUtf8(const FileName: TFileName; AssumeUtf8IfNoBom: boolean): RawUtf8; var tmp: RawByteString; buf: pointer; len: PtrInt; begin case StringFromBomFile(FileName, tmp, buf, len) of bomNone: if AssumeUtf8IfNoBom then FastAssignUtf8(result, tmp) else CurrentAnsiConvert.AnsiBufferToRawUtf8(buf, len, result); bomUnicode: RawUnicodeToUtf8(PWideChar(buf), len shr 1, result); bomUtf8: if len = 0 then result := '' else begin MoveFast(buf^, pointer(tmp)^, len); // fast in-place delete(bom) FakeLength(tmp, len); FastAssignUtf8(result, tmp) end; end; end; function AnyTextFileToSynUnicode(const FileName: TFileName; ForceUtf8: boolean): SynUnicode; var tmp: RawByteString; buf: pointer; len: PtrInt; begin if ForceUtf8 then Utf8ToSynUnicode(StringFromFile(FileName), result) else case StringFromBomFile(FileName, tmp, buf, len) of bomNone: result := CurrentAnsiConvert.AnsiToUnicodeString(buf, len); bomUnicode: SetString(result, PWideChar(buf), len shr 1); bomUtf8: Utf8ToSynUnicode(buf, len, result); end; end; function AnyTextFileToString(const FileName: TFileName; ForceUtf8: boolean): string; var tmp: RawByteString; buf: pointer; len: PtrInt; begin {$ifdef UNICODE} if ForceUtf8 then Utf8ToStringVar(StringFromFile(FileName), result) else case StringFromBomFile(FileName, tmp, buf, len) of bomNone: result := CurrentAnsiConvert.AnsiToUnicodeString(buf, len); bomUnicode: SetString(result, PWideChar(buf), len shr 1); bomUtf8: Utf8DecodeToString(buf, len, result); end; {$else} if ForceUtf8 then result := CurrentAnsiConvert.Utf8ToAnsi(StringFromFile(FileName)) else case StringFromBomFile(FileName, tmp, buf, len) of bomNone: SetString(result, PAnsiChar(buf), len); bomUnicode: result := CurrentAnsiConvert.UnicodeBufferToAnsi(buf, len shr 1); bomUtf8: result := CurrentAnsiConvert.Utf8BufferToAnsi(buf, len); end; {$endif UNICODE} end; { *************** Low-Level String Conversion Functions } procedure AnyAnsiToUtf8(const s: RawByteString; var result: RawUtf8); {$ifdef HASCODEPAGE} var cp: cardinal; {$endif HASCODEPAGE} begin if s = '' then result := '' else {$ifdef HASCODEPAGE} begin cp := GetCodePage(s); if cp = CP_ACP then begin cp := Unicode_CodePage; {$ifdef FPC} if cp = CP_UTF8 then // happens on POSIX and with Lazarus - so FPC only begin if PStrRec(PAnsiChar(pointer(s)) - _STRRECSIZE)^.refCnt >= 0 then begin result := s; // not a read-only constant: assign by ref FakeCodePage(RawByteString(result), cp); // override 0 by CP_UTF8 end else FastSetString(result, pointer(s), length(s)); // realloc constant exit; end; {$endif FPC} end; if cp = CP_UTF8 then result := s else if cp >= CP_RAWBLOB then FastSetString(result, pointer(s), length(s)) // no convert, just copy else TSynAnsiConvert.Engine(cp).AnsiBufferToRawUtf8(pointer(s), length(s), result); end; {$else} CurrentAnsiConvert.AnsiBufferToRawUtf8(pointer(s), length(s), result); {$endif HASCODEPAGE} end; function AnyAnsiToUtf8(const s: RawByteString): RawUtf8; begin AnyAnsiToUtf8(s, result); end; function WinAnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar; SourceChars: cardinal): PUtf8Char; begin result := WinAnsiConvert.AnsiBufferToUtf8(Dest, Source, SourceChars); end; function ShortStringToUtf8(const source: ShortString): RawUtf8; begin WinAnsiConvert.AnsiBufferToRawUtf8(@source[1], ord(source[0]), result); end; procedure WinAnsiToUnicodeBuffer(const S: WinAnsiString; Dest: PWordArray; DestLen: PtrInt); var L: PtrInt; begin L := length(S); if L <> 0 then begin if L >= DestLen then L := DestLen - 1; // truncate to avoid buffer overflow WinAnsiConvert.AnsiBufferToUnicode(PWideChar(Dest), pointer(S), L); // including last #0 end else Dest^[0] := 0; end; {$ifndef PUREMORMOT2} function WinAnsiToRawUnicode(const S: WinAnsiString): RawUnicode; begin result := WinAnsiConvert.AnsiToRawUnicode(S); end; {$endif PUREMORMOT2} function WinAnsiToUtf8(const S: WinAnsiString): RawUtf8; begin WinAnsiConvert.AnsiBufferToRawUtf8(pointer(S), length(S), result); end; function WinAnsiToUtf8(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): RawUtf8; begin WinAnsiConvert.AnsiBufferToRawUtf8(WinAnsi, WinAnsiLen, result); end; function WideCharToWinAnsiChar(wc: cardinal): AnsiChar; begin wc := WinAnsiConvert.WideCharToAnsiChar(wc); if integer(wc) = -1 then result := '?' else result := AnsiChar(wc); end; function WideCharToWinAnsi(wc: cardinal): integer; begin result := WinAnsiConvert.WideCharToAnsiChar(wc); end; function IsWinAnsi(WideText: PWideChar; Length: integer): boolean; begin result := WinAnsiConvert.IsValidAnsi(WideText, Length); end; function IsWinAnsi(WideText: PWideChar): boolean; begin result := WinAnsiConvert.IsValidAnsi(WideText); end; function IsWinAnsiU(Utf8Text: PUtf8Char): boolean; begin result := WinAnsiConvert.IsValidAnsiU(Utf8Text); end; function IsWinAnsiU8Bit(Utf8Text: PUtf8Char): boolean; begin result := WinAnsiConvert.IsValidAnsiU8Bit(Utf8Text); end; function Utf8ToWinPChar(dest: PAnsiChar; source: PUtf8Char; count: integer): integer; begin result := WinAnsiConvert.Utf8BufferToAnsi(dest, source, count) - dest; end; function Utf8ToWinAnsi(const S: RawUtf8): WinAnsiString; begin result := WinAnsiConvert.Utf8ToAnsi(S); end; function Utf8ToWinAnsi(P: PUtf8Char): WinAnsiString; begin result := WinAnsiConvert.Utf8ToAnsi(P); end; procedure Utf8ToRawUtf8(P: PUtf8Char; var result: RawUtf8); begin // fast and Delphi 2009+ ready FastSetString(result, P, StrLen(P)); end; {$ifndef PUREMORMOT2} function Utf8DecodeToRawUnicode(P: PUtf8Char; L: integer): RawUnicode; var tmp: TSynTempBuffer; begin result := ''; // somewhat faster if result is freed before any SetLength() if L = 0 then L := StrLen(P); if L = 0 then exit; // +1 below is for #0 ending -> true WideChar(#0) ending tmp.Init(L * 3); // maximum posible unicode size (if all <#128) SetString(result, PAnsiChar(tmp.buf), Utf8ToWideChar(tmp.buf, P, L) + 1); tmp.Done; end; function Utf8DecodeToRawUnicode(const S: RawUtf8): RawUnicode; begin if S = '' then result := '' else result := Utf8DecodeToRawUnicode(pointer(S), Length(S)); end; function Utf8DecodeToRawUnicodeUI(const S: RawUtf8; DestLen: PInteger): RawUnicode; var L: integer; begin L := Utf8DecodeToRawUnicodeUI(S, result); if DestLen <> nil then DestLen^ := L; end; function Utf8DecodeToRawUnicodeUI(const S: RawUtf8; var Dest: RawUnicode): integer; begin Dest := ''; // somewhat faster if Dest is freed before any SetLength() if S = '' then begin result := 0; exit; end; result := Length(S); SetLength(Dest, result * 2 + 2); result := Utf8ToWideChar(pointer(Dest), Pointer(S), result); end; function RawUnicodeToUtf8(const Unicode: RawUnicode): RawUtf8; begin RawUnicodeToUtf8(pointer(Unicode), Length(Unicode) shr 1, result); end; function RawUnicodeToSynUnicode(const Unicode: RawUnicode): SynUnicode; begin SetString(result, PWideChar(pointer(Unicode)), Length(Unicode) shr 1); end; function RawUnicodeToWinAnsi(const Unicode: RawUnicode): WinAnsiString; begin result := WinAnsiConvert.UnicodeBufferToAnsi(pointer(Unicode), Length(Unicode) shr 1); end; {$endif PUREMORMOT2} function SynUnicodeToUtf8(const Unicode: SynUnicode): RawUtf8; begin RawUnicodeToUtf8(pointer(Unicode), Length(Unicode), result); end; function RawUnicodeToSynUnicode(WideChar: PWideChar; WideCharCount: integer): SynUnicode; begin SetString(result, WideChar, WideCharCount); end; procedure RawUnicodeToWinPChar(dest: PAnsiChar; source: PWideChar; WideCharCount: integer); begin WinAnsiConvert.UnicodeBufferToAnsi(dest, source, WideCharCount); end; function RawUnicodeToWinAnsi(WideChar: PWideChar; WideCharCount: integer): WinAnsiString; begin result := WinAnsiConvert.UnicodeBufferToAnsi(WideChar, WideCharCount); end; function WideStringToWinAnsi(const Wide: WideString): WinAnsiString; begin result := WinAnsiConvert.UnicodeBufferToAnsi(pointer(Wide), Length(Wide)); end; procedure UnicodeBufferToWinAnsi(source: PWideChar; out Dest: WinAnsiString); var L: integer; begin L := StrLenW(source); SetLength(Dest, L); WinAnsiConvert.UnicodeBufferToAnsi(pointer(Dest), source, L); end; function UnicodeBufferToString(source: PWideChar): string; begin result := RawUnicodeToString(source, StrLenW(source)); end; function UnicodeBufferToUtf8(source: PWideChar): RawUtf8; begin RawUnicodeToUtf8(source, StrLenW(source), result); end; function UnicodeBufferToVariant(source: PWideChar): variant; begin ClearVariantForString(result); if Source <> nil then RawUnicodeToUtf8(source, StrLenW(source), RawUtf8(TVarData(result).VAny)); end; function StringToVariant(const Txt: string): variant; begin StringToVariant(Txt, result); end; procedure StringToVariant(const Txt: string; var result: variant); begin ClearVariantForString(result); if Txt <> '' then {$ifndef UNICODE} if (Unicode_CodePage = CP_UTF8) or IsAnsiCompatible(Txt) then begin RawByteString(TVarData(result).VAny) := Txt; EnsureRawUtf8(RawByteString(TVarData(result).VAny)); end else {$endif UNICODE} StringToUtf8(Txt, RawUtf8(TVarData(result).VAny)); end; procedure AnsiCharToUtf8(P: PAnsiChar; L: integer; var result: RawUtf8; CodePage: integer); begin TSynAnsiConvert.Engine(CodePage).AnsiBufferToRawUtf8(P, L, result); end; function AnsiToUtf8(const Ansi: RawByteString; CodePage: integer): RawUtf8; begin if Ansi = '' then result := '' else result := TSynAnsiConvert.Engine(CodePage).AnsiToUtf8(Ansi); end; function AnsiToString(const Ansi: RawByteString; CodePage: integer): string; begin if Ansi = '' then result := '' else {$ifdef UNICODE} result := TSynAnsiConvert.Engine(CodePage).AnsiToUnicodeString(Ansi); {$else} result := CurrentAnsiConvert.AnsiToAnsi(TSynAnsiConvert.Engine(CodePage), Ansi); {$endif UNICODE} end; function AnsiBufferToTempUtf8(var Temp: TSynTempBuffer; Buf: PAnsiChar; BufLen, CodePage: cardinal): PUtf8Char; begin if (BufLen = 0) or (CodePage = CP_UTF8) or (CodePage >= CP_RAWBLOB) or IsAnsiCompatible(Buf, BufLen) then begin temp.Buf := nil; temp.len := BufLen; result := PUtf8Char(Buf); end else begin temp.Init(BufLen * 3); Buf := pointer(TSynAnsiConvert.Engine(CodePage). AnsiBufferToUtf8(temp.Buf, Buf, BufLen)); temp.len := Buf - PAnsiChar(temp.Buf); result := temp.Buf; end; end; {$ifdef UNICODE} function Ansi7ToString(const Text: RawByteString): string; var i: PtrInt; begin SetString(result, nil, Length(Text)); for i := 0 to Length(Text) - 1 do PWordArray(result)[i] := PByteArray(Text)[i]; // no conversion for 7-bit Ansi end; function Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt): string; begin Ansi7ToString(Text, Len, result); end; procedure Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt; var result: string); var i: PtrInt; begin SetString(result, nil, Len); for i := 0 to Len - 1 do PWordArray(result)[i] := PByteArray(Text)[i]; // no conversion for 7-bit Ansi end; function StringToAnsi7(const Text: string): RawByteString; var i: PtrInt; begin SetString(result, nil, Length(Text)); for i := 0 to Length(Text) - 1 do PByteArray(result)[i] := PWordArray(Text)[i]; // no conversion for 7-bit Ansi end; function StringToWinAnsi(const Text: string): WinAnsiString; begin result := RawUnicodeToWinAnsi(Pointer(Text), Length(Text)); end; function StringBufferToUtf8(Dest: PUtf8Char; Source: PChar; SourceChars: PtrInt): PUtf8Char; begin result := Dest + RawUnicodeToUtf8(Dest, SourceChars * 3, PWideChar(Source), SourceChars, []); end; procedure StringBufferToUtf8(Source: PChar; out result: RawUtf8); begin RawUnicodeToUtf8(Source, StrLenW(Source), result); end; function StringToUtf8(const Text: string): RawUtf8; begin RawUnicodeToUtf8(pointer(Text), Length(Text), result); end; procedure StringToUtf8(Text: PChar; TextLen: PtrInt; var result: RawUtf8); begin RawUnicodeToUtf8(Text, TextLen, result); end; procedure StringToUtf8(const Text: string; var result: RawUtf8); begin RawUnicodeToUtf8(pointer(Text), Length(Text), result); end; function StringToUtf8(const Text: string; var Temp: TSynTempBuffer): integer; var len: integer; begin len := length(Text); Temp.Init(len * 3); result := RawUnicodeToUtf8(Temp.buf, Temp.len + 1, pointer(Text), len, []); end; function ToUtf8(const Text: string): RawUtf8; begin RawUnicodeToUtf8(pointer(Text), Length(Text), result); end; {$ifndef PUREMORMOT2} function StringToRawUnicode(const S: string): RawUnicode; begin SetString(result, PAnsiChar(pointer(S)), length(S) * 2 + 1); // +1 for last wide #0 end; function StringToRawUnicode(P: PChar; L: integer): RawUnicode; begin SetString(result, PAnsiChar(P), L * 2 + 1); // +1 for last wide #0 end; function RawUnicodeToString(const U: RawUnicode): string; begin // uses StrLenW() and not length(U) to handle case when was used as buffer SetString(result, PWideChar(pointer(U)), StrLenW(Pointer(U))); end; {$endif PUREMORMOT2} function StringToSynUnicode(const S: string): SynUnicode; begin result := S; end; procedure StringToSynUnicode(const S: string; var result: SynUnicode); begin result := S; end; function RawUnicodeToString(P: PWideChar; L: integer): string; begin SetString(result, P, L); end; procedure RawUnicodeToString(P: PWideChar; L: integer; var result: string); begin SetString(result, P, L); end; function SynUnicodeToString(const U: SynUnicode): string; begin result := U; end; function Utf8DecodeToString(P: PUtf8Char; L: integer): string; begin Utf8DecodeToUnicodeString(P, L, result); end; procedure Utf8DecodeToString(P: PUtf8Char; L: integer; var result: string); begin Utf8DecodeToUnicodeString(P, L, result); end; function Utf8ToString(const Text: RawUtf8): string; begin Utf8DecodeToUnicodeString(pointer(Text), length(Text), result); end; procedure Utf8ToStringVar(const Text: RawUtf8; var result: string); begin Utf8DecodeToUnicodeString(pointer(Text), length(Text), result); end; procedure Utf8ToFileName(const Text: RawUtf8; var result: TFileName); begin Utf8DecodeToUnicodeString(pointer(Text), length(Text), string(result)); end; {$else} function Ansi7ToString(const Text: RawByteString): string; begin result := Text; // if we are SURE this text is 7-bit Ansi -> direct assign {$ifdef FPC} // if Text is CP_RAWBYTESTRING then FPC won't handle it properly SetCodePage(RawByteString(result), DefaultSystemCodePage, false); {$endif FPC} // no FakeCodePage() since Text may be read-only end; function Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt): string; begin SetString(result, PAnsiChar(Text), Len); end; procedure Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt; var result: string); begin SetString(result, PAnsiChar(Text), Len); end; function StringToAnsi7(const Text: string): RawByteString; begin result := Text; // if we are SURE this text is 7-bit Ansi -> direct assign end; function StringToWinAnsi(const Text: string): WinAnsiString; begin result := WinAnsiConvert.AnsiToAnsi(CurrentAnsiConvert, Text); end; function StringBufferToUtf8(Dest: PUtf8Char; Source: PChar; SourceChars: PtrInt): PUtf8Char; begin result := CurrentAnsiConvert.AnsiBufferToUtf8(Dest, Source, SourceChars); end; procedure StringBufferToUtf8(Source: PChar; out result: RawUtf8); begin CurrentAnsiConvert.AnsiBufferToRawUtf8(Source, StrLen(Source), result); end; function StringToUtf8(const Text: string): RawUtf8; begin result := CurrentAnsiConvert.AnsiToUtf8(Text); end; procedure StringToUtf8(Text: PChar; TextLen: PtrInt; var result: RawUtf8); begin CurrentAnsiConvert.AnsiBufferToRawUtf8(Text, TextLen, result); end; procedure StringToUtf8(const Text: string; var result: RawUtf8); begin result := CurrentAnsiConvert.AnsiToUtf8(Text); end; function StringToUtf8(const Text: string; var Temp: TSynTempBuffer): integer; var len: integer; begin len := length(Text); Temp.Init(len * 3); if len <> 0 then result := CurrentAnsiConvert. AnsiBufferToUtf8(Temp.buf, pointer(Text), len) - PUtf8Char(Temp.buf) else result := 0; end; function ToUtf8(const Text: string): RawUtf8; begin result := CurrentAnsiConvert.AnsiToUtf8(Text); end; {$ifndef PUREMORMOT2} function StringToRawUnicode(const S: string): RawUnicode; begin result := CurrentAnsiConvert.AnsiToRawUnicode(S); end; function StringToRawUnicode(P: PChar; L: integer): RawUnicode; begin result := CurrentAnsiConvert.AnsiToRawUnicode(P, L); end; function RawUnicodeToString(const U: RawUnicode): string; begin // uses StrLenW() and not length(U) to handle case when was used as buffer result := CurrentAnsiConvert.UnicodeBufferToAnsi(Pointer(U), StrLenW(Pointer(U))); end; {$endif PUREMORMOT2} function StringToSynUnicode(const S: string): SynUnicode; begin result := CurrentAnsiConvert.AnsiToUnicodeString(pointer(S), length(S)); end; procedure StringToSynUnicode(const S: string; var result: SynUnicode); begin result := CurrentAnsiConvert.AnsiToUnicodeString(pointer(S), length(S)); end; function RawUnicodeToString(P: PWideChar; L: integer): string; begin result := CurrentAnsiConvert.UnicodeBufferToAnsi(P, L); end; procedure RawUnicodeToString(P: PWideChar; L: integer; var result: string); begin result := CurrentAnsiConvert.UnicodeBufferToAnsi(P, L); end; function SynUnicodeToString(const U: SynUnicode): string; begin result := CurrentAnsiConvert.UnicodeBufferToAnsi(Pointer(U), length(U)); end; function Utf8DecodeToString(P: PUtf8Char; L: integer): string; begin CurrentAnsiConvert.Utf8BufferToAnsi(P, L, RawByteString(result)); end; procedure Utf8DecodeToString(P: PUtf8Char; L: integer; var result: string); begin CurrentAnsiConvert.Utf8BufferToAnsi(P, L, RawByteString(result)); end; function Utf8ToString(const Text: RawUtf8): string; begin result := CurrentAnsiConvert.Utf8ToAnsi(Text); end; procedure Utf8ToStringVar(const Text: RawUtf8; var result: string); begin result := CurrentAnsiConvert.Utf8ToAnsi(Text); end; procedure Utf8ToFileName(const Text: RawUtf8; var result: TFileName); begin result := CurrentAnsiConvert.Utf8ToAnsi(Text); end; {$endif UNICODE} function ToUtf8(const Ansi7Text: ShortString): RawUtf8; begin FastSetString(result, @Ansi7Text[1], ord(Ansi7Text[0])); end; {$ifdef HASVARUSTRING} // some UnicodeString dedicated functions function UnicodeStringToUtf8(const S: UnicodeString): RawUtf8; begin RawUnicodeToUtf8(pointer(S), Length(S), result); end; function Utf8DecodeToUnicodeString(const S: RawUtf8): UnicodeString; begin Utf8DecodeToUnicodeString(pointer(S), Length(S), result); end; procedure Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer; var result: UnicodeString); var tmp: TSynTempBuffer; begin if (P = nil) or (L = 0) then result := '' else begin tmp.Init(L * 3); // maximum posible unicode size (if all <#128) SetString(result, PWideChar(tmp.buf), Utf8ToWideChar(tmp.buf, P, L) shr 1); tmp.Done; end; end; function UnicodeStringToWinAnsi(const S: UnicodeString): WinAnsiString; begin result := WinAnsiConvert.UnicodeBufferToAnsi(pointer(S), Length(S)); end; function Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer): UnicodeString; begin Utf8DecodeToUnicodeString(P, L, result); end; function WinAnsiToUnicodeString(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): UnicodeString; begin SetString(result, nil, WinAnsiLen); WinAnsiConvert.AnsiBufferToUnicode(pointer(result), WinAnsi, WinAnsiLen); end; function WinAnsiToUnicodeString(const WinAnsi: WinAnsiString): UnicodeString; begin result := WinAnsiToUnicodeString(pointer(WinAnsi), Length(WinAnsi)); end; {$endif HASVARUSTRING} function Utf8DecodeToUnicodeRawByteString(P: PUtf8Char; L: integer): RawByteString; begin if (P <> nil) and (L <> 0) then begin FastNewRawByteString(result, L * 3); FakeSetLength(result, Utf8ToWideChar(pointer(result), P, L)); end else result := ''; end; function Utf8DecodeToUnicodeRawByteString(const U: RawUtf8): RawByteString; begin result := Utf8DecodeToUnicodeRawByteString(pointer(U), length(U)); end; function Utf8DecodeToUnicodeStream(P: PUtf8Char; L: integer): TStream; begin result := TRawByteStringStream.Create(Utf8DecodeToUnicodeRawByteString(P, L)); end; function WinAnsiToSynUnicode(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): SynUnicode; begin SetString(result, nil, WinAnsiLen); WinAnsiConvert.AnsiBufferToUnicode(pointer(result), WinAnsi, WinAnsiLen); end; function WinAnsiToSynUnicode(const WinAnsi: WinAnsiString): SynUnicode; begin result := WinAnsiToSynUnicode(pointer(WinAnsi), Length(WinAnsi)); end; procedure UniqueRawUtf8ZeroToTilde(var u: RawUtf8; MaxSize: PtrInt); var i: PtrInt; begin i := length(u); if i > MaxSize then PByteArray(u)[MaxSize] := 0 else MaxSize := i; for i := 0 to MaxSize - 1 do if PByteArray(u)[i] = 0 then PByteArray(u)[i] := ord('~'); end; const ZEROED_CW = '~'; // any byte would do - followed by ~ or 0 function UnZeroed(const bin: RawByteString): RawUtf8; var len, z, c: PtrInt; a: AnsiChar; s, d: PAnsiChar; begin result := ''; len := length(bin); if len = 0 then exit; s := pointer(bin); z := StrLen(s); c := ByteScanIndex(pointer(s), len, ord(ZEROED_CW)); if (z = len) and (c < 0) then begin result := bin; // nothing to convert exit; end; if (c < 0) or (z < c) then c := z; FastSetString(result, len shl 1); d := pointer(result); MoveFast(s^, d^, c); inc(s, c); inc(d, c); dec(len, c); repeat a := s^; if a = #0 then begin d^ := ZEROED_CW; inc(d); a := '0'; end else if a = ZEROED_CW then begin d^ := ZEROED_CW; inc(d); end; d^ := a; inc(d); inc(s); dec(len); until len = 0; FakeLength(result, d - pointer(result)); end; function Zeroed(const u: RawUtf8): RawByteString; var len, c: PtrInt; a: AnsiChar; s, d: PAnsiChar; begin result := ''; len := length(u); if len = 0 then exit; s := pointer(u); c := ByteScanIndex(pointer(s), len, ord(ZEROED_CW)); if c < 0 then begin result := u; exit; end; FastNewRawByteString(result, len); d := pointer(result); MoveFast(s^, d^, c); inc(s, c); inc(d, c); dec(len, c); repeat a := s^; if a = ZEROED_CW then begin inc(s); dec(len); if s^ = '0' then a := #0; end; d^ := a; inc(d); inc(s); dec(len); until len = 0; FakeLength(result, d - pointer(result)); end; procedure Utf8ToWideString(const Text: RawUtf8; var result: WideString); begin Utf8ToWideString(pointer(Text), Length(Text), result); end; function Utf8ToWideString(const Text: RawUtf8): WideString; begin {$ifdef FPC} Finalize(result); {$endif FPC} Utf8ToWideString(pointer(Text), Length(Text), result); end; procedure Utf8ToWideString(Text: PUtf8Char; Len: PtrInt; var result: WideString); var tmp: TSynTempBuffer; begin if (Text = nil) or (Len = 0) then result := '' else begin tmp.Init(Len * 3); // maximum posible unicode size (if all <#128) SetString(result, PWideChar(tmp.buf), Utf8ToWideChar(tmp.buf, Text, Len) shr 1); tmp.Done; end; end; function WideStringToUtf8(const aText: WideString): RawUtf8; begin RawUnicodeToUtf8(pointer(aText), length(aText), result); end; function Utf8ToSynUnicode(const Text: RawUtf8): SynUnicode; begin Utf8ToSynUnicode(pointer(Text), length(Text), result); end; procedure Utf8ToSynUnicode(const Text: RawUtf8; var result: SynUnicode); begin Utf8ToSynUnicode(pointer(Text), length(Text), result); end; procedure Utf8ToSynUnicode(Text: PUtf8Char; Len: PtrInt; var result: SynUnicode); var tmp: TSynTempBuffer; n: PtrInt; begin n := Utf8DecodeToUnicode(Text, Len, tmp); SetString(result, PWideChar(tmp.buf), n); if n <> 0 then tmp.Done; end; function Utf8DecodeToUnicode(const Text: RawUtf8; var temp: TSynTempBuffer): PtrInt; begin result := Utf8DecodeToUnicode(pointer(Text), length(Text), temp); end; function Utf8DecodeToUnicode(Text: PUtf8Char; Len: PtrInt; var temp: TSynTempBuffer): PtrInt; begin if (Text = nil) or (Len <= 0) then begin temp.buf := nil; temp.len := 0; result := 0; end else begin temp.Init(Len * 3); // maximum posible unicode size (if all <#128) result := Utf8ToWideChar(temp.buf, Text, Len) shr 1; end; end; { **************** Text Case-(in)sensitive Conversion and Comparison } function IdemPropNameUSameLenNotNull(P1, P2: PUtf8Char; P1P2Len: PtrInt): boolean; label zero; begin {$ifndef CPUX86} result := false; {$endif CPUX86} pointer(P1P2Len) := @P1[P1P2Len - SizeOf(cardinal)]; dec(PtrUInt(P2), PtrUInt(P1)); while PtrUInt(P1P2Len) >= PtrUInt(P1) do // compare 4 Bytes per loop if (PCardinal(P1)^ xor PCardinal(@P2[PtrUInt(P1)])^) and $dfdfdfdf <> 0 then goto zero else inc(PCardinal(P1)); inc(P1P2Len, SizeOf(cardinal)); while PtrUInt(P1) < PtrUInt(P1P2Len) do if (ord(P1^) xor ord(P2[PtrUInt(P1)])) and $df <> 0 then goto zero else inc(PByte(P1)); result := true; exit; zero: {$ifdef CPUX86} result := false; {$endif CPUX86} end; function PropNameValid(P: PUtf8Char): boolean; var tab: PTextCharSet; {%H-}begin tab := @TEXT_CHARS; if (P <> nil) and (tcIdentifierFirstChar in tab[P^]) then // first char must be in ['_', 'a'..'z', 'A'..'Z'] repeat inc(P); // following chars can be ['_', '0'..'9', 'a'..'z', 'A'..'Z'] if tcIdentifier in tab[P^] then continue; result := P^ = #0; exit; until false else result := false; end; function PropNamesValid(const Values: array of RawUtf8): boolean; var i, j: PtrInt; tab: PTextCharSet; begin result := false; tab := @TEXT_CHARS; for i := 0 to high(Values) do for j := 1 to length(Values[i]) do if not (tcIdentifier in tab[Values[i][j]]) then exit; // not ['_', '0'..'9', 'a'..'z', 'A'..'Z'] result := true; end; function PropNameSanitize(const text, fallback: RawUtf8): RawUtf8; var i: PtrInt; begin result := text; for i := length(result) downto 1 do if result[i] in [#0 .. ' ', '#', '"', '''', '*'] then delete(result, i, 1); for i := 1 to length(result) do if result[i] in ['[', ']', '/', '\', '&', '@', '+', '-', '.'] then result[i] := '_'; if not PropNameValid(pointer(result)) then result := fallback; // it was not good enough end; function IdemPropName(const P1, P2: ShortString): boolean; begin result := (P1[0] = P2[0]) and ((P1[0] = #0) or (((ord(P1[1]) xor ord(P2[1])) and $df = 0) and IdemPropNameUSameLenNotNull(@P1[1], @P2[1], ord(P2[0])))); end; function IdemPropName(const P1: ShortString; P2: PUtf8Char; P2Len: PtrInt): boolean; begin result := (ord(P1[0]) = P2Len) and ((P2Len = 0) or IdemPropNameUSameLenNotNull(@P1[1], P2, P2Len)); end; function IdemPropName(P1, P2: PUtf8Char; P1Len, P2Len: PtrInt): boolean; begin result := (P1Len = P2Len) and ((P1Len = 0) or IdemPropNameUSameLenNotNull(P1, P2, P1Len)); end; function IdemPropNameU(const P1: RawUtf8; P2: PUtf8Char; P2Len: PtrInt): boolean; begin if PtrUInt(P1) <> 0 then result := (PStrLen(PAnsiChar(pointer(P1)) - _STRLEN)^ = P2Len) and ((PByte(P1)^ xor PByte(P2)^) and $df = 0) and IdemPropNameUSameLenNotNull(pointer(P1), pointer(P2), P2Len) else result := P2Len = 0; end; function IdemPropNameU(const P1, P2: RawUtf8): boolean; var L: TStrLen; begin if PtrUInt(P1) <> PtrUInt(P2) then if (PtrUInt(P1) <> 0) and (PtrUInt(P2) <> 0) then begin L := PStrLen(PAnsiChar(pointer(P1)) - _STRLEN)^; result := (PStrLen(PAnsiChar(pointer(P2)) - _STRLEN)^ = L) and ((PByte(P1)^ xor PByte(P2)^) and $df = 0) and IdemPropNameUSameLenNotNull(pointer(P1), pointer(P2), L); end else result := false else result := true; end; function IdemPChar(p: PUtf8Char; up: PAnsiChar): boolean; var {$ifdef CPUX86NOTPIC} table: TNormTable absolute NormToUpperAnsi7; {$else} table: PNormTable; // faster on PIC/ARM and x86_64 {$endif CPUX86NOTPIC} begin result := false; if p = nil then exit; if up <> nil then begin dec(PtrUInt(p), PtrUInt(up)); {$ifndef CPUX86NOTPIC} table := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} while true do if up^ = #0 then break else if table[up[PtrUInt(p)]] <> up^ then exit else inc(up); end; result := true; end; function IdemPChar(p: PUtf8Char; up: PAnsiChar; table: PNormTable): boolean; begin result := false; if p = nil then exit; if up <> nil then begin dec(PtrUInt(p), PtrUInt(up)); while true do if up^ = #0 then break else if table[up[PtrUInt(p)]] <> up^ then exit else inc(up); end; result := true; end; function IdemPCharAnsi( {$ifdef CPUX86NOTPIC} const table: TNormTable; {$else} const table: PNormTable; {$endif CPUX86NOTPIC} p: PUtf8Char; up: PAnsiChar): boolean; {$ifdef HASINLINE}inline;{$endif} begin // in this local IdemPChar() version, p and up are expected to be <> nil result := false; dec(PtrUInt(p), PtrUInt(up)); while true do if up^ = #0 then break else if table[up[PtrUInt(p)]] <> up^ then exit else inc(up); result := true; end; function IdemPCharByte( {$ifdef CPUX86NOTPIC} const table: TNormTableByte; {$else} const table: PByteArray; {$endif CPUX86NOTPIC} p: PUtf8Char; up: PAnsiChar): boolean; {$ifdef HASINLINE}inline;{$endif} begin // in this local IdemPChar() version, p and up are expected to be <> nil result := false; dec(PtrUInt(p), PtrUInt(up)); while true do if up^ = #0 then break else if table[PtrInt(up[PtrUInt(p)])] <> PByte(up)^ then exit else inc(up); result := true; end; function IdemPCharWithoutWhiteSpace(p: PUtf8Char; up: PAnsiChar): boolean; begin result := False; if p = nil then exit; if up <> nil then while up^ <> #0 do begin while p^ <= ' ' do // trim white space if p^ = #0 then exit else inc(p); if up^ <> NormToUpperAnsi7[p^] then exit; inc(up); inc(p); end; result := true; end; function IdemPCharArray(p: PUtf8Char; const upArray: array of PAnsiChar): integer; var w: word; up: ^PAnsiChar; {$ifdef CPUX86NOTPIC} tab: TNormTableByte absolute NormToUpperAnsi7; {$else} tab: PByteArray; // faster on PIC/ARM and x86_64 {$endif CPUX86NOTPIC} begin if p <> nil then begin {$ifndef CPUX86NOTPIC} tab := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} w := tab[ord(p[0])] + tab[ord(p[1])] shl 8; up := @upArray[0]; for result := 0 to high(upArray) do if (PWord(up^)^ = w) and IdemPCharByte(tab, p + 2, up^ + 2) then exit else inc(up); end; result := -1; end; function IdemPPChar(p: PUtf8Char; up: PPAnsiChar): PtrInt; var w: word; u: PAnsiChar; p2: PtrUInt; c: byte; {$ifdef CPUX86NOTPIC} tab: TNormTableByte absolute NormToUpperAnsi7; {$else} tab: PByteArray; // faster on PIC/ARM and x86_64 {$endif CPUX86NOTPIC} begin if p <> nil then begin // uppercase the first two p^ chars {$ifndef CPUX86NOTPIC} tab := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} w := tab[ord(p[0])] + tab[ord(p[1])] shl 8; result := 0; repeat // quickly check the first 2 up^[result] chars u := PPointerArray(up)[result]; if u = nil then break else if PWord(u)^ <> w then begin inc(result); continue; end; // inlined if IdemPCharByte(tab, p + 2, up^ + 2) then exit p2 := PtrUInt(p); dec(p2, PtrUInt(u)); inc(u, 2); repeat c := PByte(u)^; if c = 0 then exit // found IdemPChar(p^, up^[result]) else if tab[PtrUInt(u[p2])] <> c then break; // at least one char doesn't match inc(u); until false; inc(result); until false; end; result := -1; end; function IdemPCharArrayBy2(p: PUtf8Char; const upArrayBy2Chars: RawUtf8): PtrInt; begin if p <> nil then result := WordScanIndex(pointer(upArrayBy2Chars), length(upArrayBy2Chars) shr 1, NormToUpperAnsi7Byte[ord(p[0])] + NormToUpperAnsi7Byte[ord(p[1])] shl 8) else result := -1; end; function IdemPCharU(p, up: PUtf8Char): boolean; begin result := false; if (p = nil) or (up = nil) then exit; while up^ <> #0 do begin if GetNextUtf8Upper(p) <> ord(up^) then exit; inc(up); end; result := true; end; function IdemPCharW(p: PWideChar; up: PUtf8Char): boolean; begin result := false; if (p = nil) or (up = nil) then exit; while up^ <> #0 do begin if (p^ > #255) or (up^ <> AnsiChar(NormToUpperByte[ord(p^)])) then exit; inc(up); inc(p); end; result := true; end; function StartWith(const text, upTextStart: RawUtf8): boolean; begin result := (PtrUInt(text) <> 0) and (PtrUInt(upTextStart) <> 0) and (PStrLen(PAnsiChar(pointer(text)) - _STRLEN)^ >= PStrLen(PAnsiChar(pointer(upTextStart)) - _STRLEN)^) and IdemPChar(pointer(text), pointer(upTextStart)); end; function EndWith(const text, upTextEnd: RawUtf8): boolean; var o: PtrInt; begin o := length(text) - length(upTextEnd); result := (o >= 0) and IdemPChar(PUtf8Char(pointer(text)) + o, pointer(upTextEnd)); end; function EndWithArray(const text: RawUtf8; const upArray: array of RawUtf8): integer; var t, o: PtrInt; {$ifdef CPUX86NOTPIC} tab: TNormTableByte absolute NormToUpperAnsi7; {$else} tab: PByteArray; // faster on PIC/ARM and x86_64 {$endif CPUX86NOTPIC} begin t := length(text); if t > 0 then begin {$ifndef CPUX86NOTPIC} tab := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} for result := 0 to high(upArray) do begin o := t - length(upArray[result]); if (o >= 0) and ((upArray[result] = '') or IdemPCharByte(tab, PUtf8Char(pointer(text)) + o, pointer(upArray[result]))) then exit; end; end; result := -1; end; function IdemFileExt(p: PUtf8Char; extup: PAnsiChar; sepChar: AnsiChar): boolean; var ext: PUtf8Char; begin if (p <> nil) and (extup <> nil) then begin ext := nil; repeat if p^ = sepChar then ext := p; // get last '.' position from p into ext inc(p); until p^ = #0; result := IdemPChar(ext, extup); end else result := false; end; function IdemFileExts(p: PUtf8Char; const extup: array of PAnsiChar; sepChar: AnsiChar): integer; var ext: PUtf8Char; begin result := -1; if (p <> nil) and (high(extup) > 0) then begin ext := nil; repeat if p^ = sepChar then ext := p; // get last '.' position from p into ext inc(p); until p^ = #0; if ext <> nil then result := IdemPCharArray(ext, extup); end; end; function PosCharAny(Str: PUtf8Char; Characters: PAnsiChar): PUtf8Char; var s: PAnsiChar; c: AnsiChar; begin if (Str <> nil) and (Characters <> nil) and (Characters^ <> #0) then repeat c := Str^; if c = #0 then break; result := Str; s := Characters; repeat if s^ = c then exit; inc(s); until s^ = #0; inc(Str); until false; result := nil; end; function PosI(uppersubstr: PUtf8Char; const str: RawUtf8): PtrInt; var u: AnsiChar; {$ifdef CPUX86NOTPIC} table: TNormTable absolute NormToUpperAnsi7; {$else} table: PNormTable; {$endif CPUX86NOTPIC} begin if uppersubstr <> nil then begin {$ifndef CPUX86NOTPIC} table := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} u := uppersubstr^; for result := 1 to Length(str) do if table[str[result]] = u then if IdemPCharAnsi(table, @PUtf8Char(pointer(str))[result], PAnsiChar(uppersubstr) + 1) then exit; end; result := 0; end; function StrPosI(uppersubstr, str: PUtf8Char): PUtf8Char; var u: AnsiChar; {$ifdef CPUX86NOTPIC} table: TNormTable absolute NormToUpperAnsi7; {$else} table: PNormTable; {$endif CPUX86NOTPIC} begin if (uppersubstr <> nil) and (str <> nil) then begin {$ifndef CPUX86NOTPIC} table := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} u := uppersubstr^; inc(uppersubstr); result := str; while result^ <> #0 do begin if table[result^] = u then if IdemPCharAnsi(table, result + 1, PAnsiChar(uppersubstr)) then exit; inc(result); end; end; result := nil; end; function PosIU(substr: PUtf8Char; const str: RawUtf8): integer; var p: PUtf8Char; begin if (substr <> nil) and (str <> '') then begin p := pointer(str); repeat if GetNextUtf8Upper(p) = ord(substr^) then if IdemPCharU(p, substr + 1) then begin result := p - pointer(str); exit; end; until p^ = #0; end; result := 0; end; function strspn(s, accept: pointer): integer; // FPC is efficient at compiling this code, but is SLOWER when inlined var p: PCardinal; c: AnsiChar; d: cardinal; begin // returns size of initial segment of s which are in accept result := 0; repeat c := PAnsiChar(s)[result]; if c = #0 then break; p := accept; repeat // stop as soon as we find any character not from accept d := p^; inc(p); if AnsiChar(d) = c then break else if AnsiChar(d) = #0 then exit; d := d shr 8; if AnsiChar(d) = c then break else if AnsiChar(d) = #0 then exit; d := d shr 8; if AnsiChar(d) = c then break else if AnsiChar(d) = #0 then exit; d := d shr 8; if AnsiChar(d) = c then break else if AnsiChar(d) = #0 then exit; until false; inc(result); until false; end; function strcspn(s, reject: pointer): integer; // FPC is efficient at compiling this code, but is SLOWER when inlined var p: PCardinal; c: AnsiChar; d: cardinal; begin // returns size of initial segment of s which are not in reject result := 0; repeat c := PAnsiChar(s)[result]; if c = #0 then break; p := reject; repeat // stop as soon as we find any character from reject d := p^; inc(p); if AnsiChar(d) = c then exit else if AnsiChar(d) = #0 then break; d := d shr 8; if AnsiChar(d) = c then exit else if AnsiChar(d) = #0 then break; d := d shr 8; if AnsiChar(d) = c then exit else if AnsiChar(d) = #0 then break; d := d shr 8; if AnsiChar(d) = c then exit else if AnsiChar(d) = #0 then break; until false; inc(result); until false; end; function StrCompL(P1, P2: pointer; L, Default: PtrInt): PtrInt; var i: PtrInt; begin i := 0; repeat result := PByteArray(P1)[i] - PByteArray(P2)[i]; if result = 0 then begin inc(i); if i < L then continue else break; end; exit; until false; result := Default; end; function StrCompIL(P1, P2: pointer; L, Default: PtrInt): PtrInt; var i: PtrInt; {$ifdef CPUX86NOTPIC} tab: TNormTableByte absolute NormToUpperAnsi7Byte; {$else} tab: PByteArray; // faster on PIC/ARM and x86_64 {$endif CPUX86NOTPIC} begin i := 0; {$ifndef CPUX86NOTPIC} tab := @NormToUpperAnsi7Byte; {$endif CPUX86NOTPIC} repeat if tab[PByteArray(P1)[i]] = tab[PByteArray(P2)[i]] then begin inc(i); if i < L then continue else break; end; result := PByteArray(P1)[i] - PByteArray(P2)[i]; exit; until false; result := Default; end; function StrICompNotNil(Str1, Str2: pointer; Up: PNormTableByte): PtrInt; var C1, C2: byte; // integer/PtrInt are actually slower on FPC begin result := PtrInt(PtrUInt(Str2)) - PtrInt(PtrUInt(Str1)); if result <> 0 then begin repeat C1 := Up[PByteArray(Str1)[0]]; C2 := Up[PByteArray(Str1)[result]]; inc(PByte(Str1)); until (C1 = 0) or (C1 <> C2); result := C1 - C2; end; end; function StrICompLNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt; begin result := 0; repeat if Up[PByteArray(Str1)[result]] = Up[PByteArray(Str2)[result]] then begin inc(result); if result < L then continue else break; end; result := PByteArray(Str1)[result] - PByteArray(Str2)[result]; exit; until false; result := 0; end; function StrILNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt; begin result := 0; repeat if Up[PByteArray(Str1)[result]] <> Up[PByteArray(Str2)[result]] then exit; inc(result); until result = L; end; function StrIComp(Str1, Str2: pointer): PtrInt; var C1, C2: byte; // integer/PtrInt are actually slower on FPC {$ifdef CPUX86NOTPIC} table: TNormTableByte absolute NormToUpperAnsi7Byte; {$else} table: PByteArray; {$endif CPUX86NOTPIC} begin result := PtrInt(PtrUInt(Str2)) - PtrInt(PtrUInt(Str1)); if result <> 0 then if Str1 <> nil then if Str2 <> nil then begin {$ifndef CPUX86NOTPIC} table := @NormToUpperAnsi7Byte; {$endif CPUX86NOTPIC} repeat C1 := table[PByteArray(Str1)[0]]; C2 := table[PByteArray(Str1)[result]]; inc(PByte(Str1)); until (C1 = 0) or (C1 <> C2); result := C1 - C2; end else // Str2='' result := 1 else // Str1='' result := -1; end; function GetLineContains(p, pEnd, up: PUtf8Char): boolean; var i: PtrInt; {$ifdef CPUX86NOTPIC} table: TNormTable absolute NormToUpperAnsi7Byte; {$else} table: PNormTable; {$endif CPUX86NOTPIC} label Fnd1, LF1, Fnd2, LF2, Ok; // ugly but fast begin if (p <> nil) and (up <> nil) then begin {$ifndef CPUX86NOTPIC} table := @NormToUpperAnsi7; {$endif CPUX86NOTPIC} if pEnd = nil then repeat if p^ <= #13 then // p^ into a temp var is slower goto LF1 else if table[p^] = up^ then goto Fnd1; inc(p); continue; LF1: if (p^ = #0) or (p^ = #13) or (p^ = #10) then break; inc(p); continue; Fnd1: i := 0; repeat inc(i); if up[i] <> #0 then if up[i] = table[p[i]] then continue else break else begin Ok: result := true; // found exit; end; until false; inc(p); until false else repeat if p >= pEnd then break; if p^ <= #13 then goto LF2 else if table[p^] = up^ then goto Fnd2; inc(p); continue; LF2: if (p^ = #13) or (p^ = #10) then break; inc(p); continue; Fnd2: i := 0; repeat inc(i); if up[i] = #0 then goto Ok; if p + i >= pEnd then break; until up[i] <> table[p[i]]; inc(p); until false; end; result := false; end; function ContainsUtf8(p, up: PUtf8Char): boolean; var u: PByte; begin if (p <> nil) and (up <> nil) and (up^ <> #0) then begin result := true; repeat u := pointer(up); repeat if GetNextUtf8Upper(p) <> u^ then break else inc(u); if u^ = 0 then exit; // up^ was found inside p^ until false; p := FindNextUtf8WordBegin(p); until p = nil; end; result := false; end; function GetNextUtf8Upper(var U: PUtf8Char): Ucs4CodePoint; begin result := ord(U^); if result = 0 then exit; if result <= 127 then begin inc(U); result := NormToUpperByte[result]; exit; end; result := UTF8_TABLE.GetHighUtf8Ucs4(U); if (result <= 255) and (WinAnsiConvert.AnsiToWide[result] <= 255) then result := NormToUpperByte[result]; end; function FindNextUtf8WordBegin(U: PUtf8Char): PUtf8Char; var c: cardinal; V: PUtf8Char; begin result := nil; repeat c := GetNextUtf8Upper(U); if c = 0 then exit; until (c >= 127) or not (tcWord in TEXT_BYTES[c]); // not ['0'..'9', 'a'..'z', 'A'..'Z'] repeat V := U; c := GetNextUtf8Upper(U); if c = 0 then exit; until (c < 127) and (tcWord in TEXT_BYTES[c]); result := V; end; function AnsiICompW(u1, u2: PWideChar): PtrInt; var C1, C2: PtrInt; {$ifdef CPUX86NOTPIC} table: TNormTableByte absolute NormToUpperAnsi7Byte; {$else} table: PByteArray; {$endif CPUX86NOTPIC} begin if u1 <> u2 then if u1 <> nil then if u2 <> nil then begin {$ifndef CPUX86NOTPIC} table := @NormToUpperAnsi7Byte; {$endif CPUX86NOTPIC} repeat C1 := PtrInt(u1^); C2 := PtrInt(u2^); result := C1 - C2; if result <> 0 then begin if (C1 > 255) or (C2 > 255) then exit; result := table[C1] - table[C2]; if result <> 0 then exit; end; if (C1 = 0) or (C2 = 0) then break; inc(u1); inc(u2); until false; end else result := 1 else // u2='' result := -1 else // u1='' result := 0; // u1=u2 end; function AnsiIComp(Str1, Str2: pointer): PtrInt; var C1, C2: byte; // integer/PtrInt are actually slower on FPC lookupper: PByteArray; // better x86-64 / PIC asm generation begin result := PtrInt(PtrUInt(Str2)) - PtrInt(PtrUInt(Str1)); if result <> 0 then if Str1 <> nil then if Str2 <> nil then begin lookupper := @NormToUpperByte; repeat C1 := lookupper[PByteArray(Str1)[0]]; C2 := lookupper[PByteArray(Str1)[result]]; inc(PByte(Str1)); until (C1 = 0) or (C1 <> C2); result := C1 - C2; end else result := 1 else // Str2='' result := -1; // Str1='' end; function SortDynArrayAnsiStringI(const A, B): integer; begin result := StrIComp(PUtf8Char(A), PUtf8Char(B)); // very agressively inlined end; function SortDynArrayPUtf8CharI(const A, B): integer; begin result := StrIComp(PUtf8Char(A), PUtf8Char(B)); end; function SortDynArrayStringI(const A, B): integer; begin {$ifdef UNICODE} result := AnsiICompW(PWideChar(A), PWideChar(B)); {$else} result := StrIComp(PUtf8Char(A), PUtf8Char(B)); {$endif UNICODE} end; function SortDynArrayUnicodeStringI(const A, B): integer; begin result := AnsiICompW(PWideChar(A), PWideChar(B)); end; function ConvertCaseUtf8(P: PUtf8Char; const Table: TNormTableByte): PtrInt; var D, S: PUtf8Char; c: PtrUInt; extra, i: PtrInt; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} begin result := 0; if P = nil then exit; {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} D := P; repeat c := byte(P[0]); inc(P); if c = 0 then break; if c <= 127 then begin D[result] := AnsiChar(Table[c]); inc(result); end else begin extra := utf8.Lookup[c]; if extra = UTF8_INVALID then exit; // invalid leading byte i := 0; repeat if byte(P[i]) and $c0 <> $80 then exit; // invalid input content c := (c shl 6) + byte(P[i]); inc(i); until i = extra; with utf8.Extra[extra] do begin dec(c, offset); if c < minimum then exit; // invalid input content end; if (c <= 255) and (Table[c] <= 127) then begin D[result] := AnsiChar(Table[c]); inc(result); inc(P, extra); continue; end; S := P - 1; inc(P, extra); inc(extra); MoveByOne(S, D + result, extra); inc(result, extra); end; until false; end; function UpperCaseU(const S: RawUtf8): RawUtf8; var LS, LD: integer; begin LS := length(S); FastSetString(result, pointer(S), LS); LD := ConvertCaseUtf8(pointer(result), NormToUpperByte); if LS <> LD then SetLength(result, LD); end; function LowerCaseU(const S: RawUtf8): RawUtf8; var LS, LD: integer; begin LS := length(S); FastSetString(result, pointer(S), LS); LD := ConvertCaseUtf8(pointer(result), NormToLowerByte); if LS <> LD then SetLength(result, LD); end; function Utf8IComp(u1, u2: PUtf8Char): PtrInt; var c2: PtrInt; {$ifdef CPUX86NOTPIC} table: TNormTableByte absolute NormToUpperByte; {$else} table: PByteArray; {$endif CPUX86NOTPIC} label c2low; begin // fast UTF-8 comparison using the NormToUpper[] array for all 8-bit values {$ifndef CPUX86NOTPIC} table := @NormToUpperByte; {$endif CPUX86NOTPIC} if u1 <> u2 then if u1 <> nil then if u2 <> nil then repeat result := ord(u1^); c2 := ord(u2^); if result <= 127 then if result <> 0 then begin inc(u1); result := table[result]; if c2 <= 127 then begin c2low: if c2 = 0 then exit; // u1>u2 -> return u1^ inc(u2); dec(result, table[c2]); if result <> 0 then exit; continue; end; end else begin // u1^=#0 -> end of u1 reached if c2 <> 0 then // end of u2 reached -> u1=u2 -> return 0 result := -1; // u1 0 then exit; until false else result := 1 // u2='' else result := -1 // u1='' else result := 0; // u1=u2 end; function Utf8ILComp(u1, u2: PUtf8Char; L1, L2: cardinal): PtrInt; var c2: PtrInt; extra, i: integer; {$ifdef CPUX86NOTPIC} table: TNormTableByte absolute NormToUpperByte; utf8: TUtf8Table absolute UTF8_TABLE; {$else} table: PByteArray; utf8: PUtf8Table; {$endif CPUX86NOTPIC} label neg, pos; begin // fast UTF-8 comparison using the NormToUpper[] array for all 8-bit values {$ifndef CPUX86NOTPIC} table := @NormToUpperByte; utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} if u1 <> u2 then if (u1 <> nil) and (L1 <> 0) then if (u2 <> nil) and (L2 <> 0) then repeat result := ord(u1^); c2 := ord(u2^); inc(u1); dec(L1); if result <= 127 then begin result := table[result]; if c2 <= 127 then begin // 'a'..'z' / 'A'..'Z' case insensitive comparison dec(result, table[c2]); dec(L2); inc(u2); if result <> 0 then // found unmatching char exit else if L1 <> 0 then if L2 <> 0 then // L1>0 and L2>0 -> next char continue else // L1>0 and L2=0 -> u1>u2 goto pos else if L2 <> 0 then // L1=0 and L2>0 -> u1 u1=u2 -> returns 0 exit; end; end else begin // Win-1252 case insensitive comparison extra := utf8.Lookup[result]; if extra = UTF8_INVALID then // invalid leading byte goto neg; dec(L1, extra); if integer(L1) < 0 then goto neg; i := 0; repeat result := result shl 6; inc(result, ord(u1[i])); inc(i); until i = extra; inc(u1, extra); dec(result, utf8.Extra[extra].offset); if result and $ffffff00 = 0 then // 8-bit to upper conversion, 32-bit as is result := table[result]; end; // here result=NormToUpper[u1^] inc(u2); dec(L2); if c2 <= 127 then begin dec(result, table[c2]); if result <> 0 then // found unmatching char exit; end else begin extra := utf8.Lookup[c2]; if extra = UTF8_INVALID then goto pos; dec(L2, extra); if integer(L2) < 0 then goto pos; i := 0; repeat c2 := c2 shl 6; inc(c2, ord(u2[i])); inc(i); until i = extra; inc(u2, extra); dec(c2, utf8.Extra[extra].offset); if c2 and $ffffff00 = 0 then // 8-bit to upper dec(result, table[c2]) else // returns 32-bit diff dec(result, c2); if result <> 0 then // found unmatching char exit; end; // here we have result=NormToUpper[u2^]-NormToUpper[u1^]=0 if L1 = 0 then // test if we reached end of u1 or end of u2 if L2 = 0 then // u1=u2 exit else // u1u2 goto pos; until false else pos: // u2='' or u1>u2 result := 1 else neg: // u1='' or u1 UpperValue^ then break; inc(UpperValue); if UpperValue^ = #0 then begin result := true; // UpperValue found! exit; end; inc(A); if A^ = #0 then exit; until false; // find beginning of next word repeat if A^ = #0 then exit else if not (tcWord in TEXT_CHARS[NormToUpper[A^]]) then break else inc(A); until false; until false; end; function FindUnicode(PW, Upper: PWideChar; UpperLen: PtrInt): boolean; var Start: PWideChar; w: PtrUInt; begin result := false; if (PW = nil) or (Upper = nil) then exit; repeat // go to beginning of next word repeat w := ord(PW^); if w = 0 then exit else if (w > 126) or (tcWord in TEXT_BYTES[w]) then Break; inc(PW); until false; Start := PW; // search end of word matching UpperLen characters repeat inc(PW); w := ord(PW^); until (PW - Start >= UpperLen) or (w = 0) or ((w < 127) and not (tcWord in TEXT_BYTES[w])); if PW - Start >= UpperLen then if Unicode_CompareString(Start, Upper, UpperLen, UpperLen, {ignorecase=}true) = 2 then begin result := true; // case-insensitive match found exit; end; // not found: go to end of current word repeat w := ord(PW^); if w = 0 then exit else if ((w < 127) and not (tcWord in TEXT_BYTES[w])) then Break; inc(PW); until false; until false; end; function FindUtf8(U: PUtf8Char; UpperValue: PAnsiChar): boolean; var ValueStart: PAnsiChar; c: PtrUInt; FirstChar: AnsiChar; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label Next; begin result := false; if (U = nil) or (UpperValue = nil) then exit; // handles 8-bits WinAnsi chars inside UTF-8 encoded data {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} FirstChar := UpperValue^; ValueStart := UpperValue + 1; repeat // test beginning of word repeat c := byte(U^); inc(U); if c = 0 then exit; if c <= 127 then begin if tcWord in TEXT_BYTES[c] then if PAnsiChar(@NormToUpper)[c] <> FirstChar then goto Next else break; end else if c and $20 = 0 then begin // fast direct process of $0..$7ff codepoints including accents c := ((c shl 6) + byte(U^)) - $3080; inc(U); if c <= 255 then begin c := NormToUpperByte[c]; if tcWord in TEXT_BYTES[c] then if AnsiChar(c) <> FirstChar then goto Next else break; end; end else begin c := utf8.Lookup[c]; if c = UTF8_INVALID then exit else // just ignore surrogates for soundex inc(U, c); end; until false; // here we had the first char match -> check if this word match UpperValue UpperValue := ValueStart; repeat if UpperValue^ = #0 then begin result := true; // UpperValue found! exit; end; c := byte(U^); inc(U); // next chars if c = 0 then exit else if c <= 127 then begin if PAnsiChar(@NormToUpper)[c] <> UpperValue^ then break; end else if c and $20 = 0 then begin c := ((c shl 6) + byte(U^)) - $3080; inc(U); if (c > 255) or (PAnsiChar(@NormToUpper)[c] <> UpperValue^) then break; end else begin c := utf8.Lookup[c]; if c = UTF8_INVALID then exit else inc(U, c); break; end; inc(UpperValue); until false; Next: // find beginning of next word U := FindNextUtf8WordBegin(U); until U = nil; end; function UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PAnsiChar; begin if source <> '' then result := UpperCopy255Buf( dest, pointer(source), PStrLen(PAnsiChar(pointer(source)) - _STRLEN)^) else result := dest; end; function UpperCopy255Buf(dest: PAnsiChar; source: PUtf8Char; sourceLen: PtrInt): PAnsiChar; var i, c, d {$ifdef CPU64}, _80, _61, _7b {$endif}: PtrUInt; begin if sourceLen <> 0 then begin if sourceLen > 248 then sourceLen := 248; // avoid buffer overflow // we allow to copy up to 3/7 more chars in Dest^ since its size is 255 {$ifdef CPU64} // unbranched uppercase conversion of 8 chars blocks _80 := PtrUInt($8080808080808080); // use registers for constants _61 := $6161616161616161; _7b := $7b7b7b7b7b7b7b7b; for i := 0 to (sourceLen - 1) shr 3 do begin c := PPtrUIntArray(source)^[i]; d := c or _80; PPtrUIntArray(dest)^[i] := c - ((d - _61) and not (d - _7b)) and ((not c) and _80) shr 2; end; {$else} // unbranched uppercase conversion of 4 chars blocks for i := 0 to (sourceLen - 1) shr 2 do begin c := PPtrUIntArray(source)^[i]; d := c or PtrUInt($80808080); PPtrUIntArray(dest)^[i] := c - ((d - PtrUInt($61616161)) and not(d - PtrUInt($7b7b7b7b))) and ((not c) and PtrUInt($80808080)) shr 2; end; {$endif CPU64} end; result := dest + sourceLen; // return the exact size end; function UpperCopyWin255(dest: PWinAnsiChar; const source: RawUtf8): PWinAnsiChar; var i, L: PtrInt; {$ifdef CPUX86NOTPIC} tab: TNormTableByte absolute NormToUpperByte; {$else} tab: PByteArray; // faster on PIC/ARM and x86_64 {$endif CPUX86NOTPIC} begin if source = '' then result := dest else begin L := PStrLen(PAnsiChar(pointer(source)) - _STRLEN)^; if L > 250 then L := 250; // avoid buffer overflow result := dest + L; {$ifndef CPUX86NOTPIC} tab := @NormToUpperByte; {$endif CPUX86NOTPIC} for i := 0 to L - 1 do dest[i] := AnsiChar(tab[PByteArray(source)[i]]); end; end; function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Char; var c: cardinal; endSource, endSourceBy4, up: PUtf8Char; extra, i: PtrInt; {$ifdef CPUX86NOTPIC} utf8: TUtf8Table absolute UTF8_TABLE; {$else} utf8: PUtf8Table; {$endif CPUX86NOTPIC} label by1, by4, set1; // ugly but faster begin if (Source <> nil) and (Dest <> nil) then begin {$ifndef CPUX86NOTPIC} utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} // first handle trailing 7-bit ASCII chars, by quad (Sha optimization) endSource := Source + SourceChars; endSourceBy4 := endSource - 4; up := @NormToUpper; if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= endSourceBy4) then repeat c := PCardinal(Source)^; if c and $80808080 <> 0 then goto by1; // break on first non ASCII quad by4: inc(Source, 4); Dest[0] := up[ToByte(c)]; Dest[1] := up[ToByte(c shr 8)]; Dest[2] := up[ToByte(c shr 16)]; Dest[3] := up[ToByte(c shr 24)]; inc(Dest, 4); until Source > endSourceBy4; // generic loop, handling one UCS4 CodePoint per iteration if Source < endSource then repeat by1: c := byte(Source^); inc(Source); if c <= 127 then begin Dest^ := up[c]; set1: inc(Dest); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif} (Source <= endSourceBy4) then begin c := PCardinal(Source)^; if c and $80808080 = 0 then goto by4; continue; end else if Source < endSource then continue else break; end else begin extra := utf8.Lookup[c]; if (extra = UTF8_INVALID) or (Source + extra > endSource) then break; i := 0; repeat c := (c shl 6) + byte(Source[i]); inc(i) until i = extra; with utf8.Extra[extra] do begin dec(c, offset); if c < minimum then break; // invalid input content end; if (c <= 255) and (up[c] <= #127) then begin Dest^ := up[c]; inc(Source, extra); goto set1; end; Dest^ := Source[-1]; repeat // here we now extra>0 - just copy UTF-8 input untouched inc(Dest); Dest^ := Source^; inc(Source); dec(extra); if extra = 0 then goto set1; until false; end; until false; end; result := Dest; end; function Utf8UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PUtf8Char; var L: integer; begin L := length(source); if L > 0 then begin if L > 250 then L := 250; // avoid buffer overflow result := Utf8UpperCopy(pointer(dest), pointer(source), L); end else result := pointer(dest); end; function UpperCopy255W(dest: PAnsiChar; const source: SynUnicode): PAnsiChar; begin result := UpperCopy255W(dest, pointer(source), length(source)); end; function UpperCopy255W(dest: PAnsiChar; source: PWideChar; L: PtrInt): PAnsiChar; var c: PtrUInt; d: byte; lookupper: PByteArray; // better x86-64 / PIC asm generation begin if L > 0 then begin if L > 250 then L := 250; // avoid buffer overflow lookupper := @NormToUpperAnsi7Byte; repeat c := PWord(source)^; d := ord('?'); if c < 255 then d := lookupper[c]; dest^ := AnsiChar(d); inc(dest); inc(source); dec(L); until L = 0; end; result := dest; end; function UpperCopy(dest: PAnsiChar; const source: RawUtf8): PAnsiChar; var s: PAnsiChar; c: byte; lookupper: PByteArray; // better x86-64 / PIC asm generation begin s := pointer(source); if s <> nil then begin lookupper := @NormToUpperAnsi7Byte; repeat c := lookupper[ord(s^)]; if c = 0 then break; dest^ := AnsiChar(c); inc(s); inc(dest); until false; end; result := dest; end; function UpperCopyShort(dest: PAnsiChar; const source: ShortString): PAnsiChar; var s: PByteArray; i: PtrInt; lookupper: PByteArray; // better x86-64 / PIC asm generation begin s := @source; lookupper := @NormToUpperAnsi7Byte; for i := 1 to s[0] do begin dest^ := AnsiChar(lookupper[s[i]]); inc(dest); end; result := dest; end; function UpperCaseUnicode(const S: RawUtf8): RawUtf8; var tmp: TSynTempBuffer; len: integer; begin if S = '' then begin result := ''; exit; end; tmp.Init(length(s) * 2); len := Utf8ToWideChar(tmp.buf, pointer(S), length(S)) shr 1; RawUnicodeToUtf8(tmp.buf, Unicode_InPlaceUpper(tmp.buf, len), result); tmp.Done; end; function UpperCaseSynUnicode(const S: SynUnicode): SynUnicode; begin {$ifdef UNICODE} result := SysUtils.UpperCase(S); {$else} {$ifdef HASVARUSTRING} result := UnicodeUpperCase(S); {$else} result := WideUpperCase(s); {$endif HASVARUSTRING} {$endif UNICODE} end; function LowerCaseSynUnicode(const S: SynUnicode): SynUnicode; begin {$ifdef UNICODE} result := SysUtils.LowerCase(S); {$else} {$ifdef HASVARUSTRING} result := UnicodeLowerCase(S); {$else} result := WideLowerCase(s); {$endif HASVARUSTRING} {$endif UNICODE} end; function LowerCaseUnicode(const S: RawUtf8): RawUtf8; var tmp: TSynTempBuffer; len: integer; begin if S = '' then begin result := ''; exit; end; tmp.Init(length(s) * 2); len := Utf8ToWideChar(tmp.buf, pointer(S), length(S)) shr 1; RawUnicodeToUtf8(tmp.buf, Unicode_InPlaceLower(tmp.buf, len),result); tmp.Done; end; function IsCaseSensitive(const S: RawUtf8): boolean; begin result := IsCaseSensitive(pointer(S), length(S)); end; function IsCaseSensitive(P: PUtf8Char; PLen: PtrInt): boolean; begin result := true; if (P <> nil) and (PLen > 0) then repeat if ord(P^) in [ord('a')..ord('z'), ord('A')..ord('Z')] then exit; inc(P); dec(PLen); until PLen = 0; result := false; end; procedure CaseCopy(Text: PUtf8Char; Len: PtrInt; Table: PNormTable; var Dest: RawUtf8); var i: PtrInt; tmp: PAnsiChar; begin tmp := FastNewString(Len, CP_UTF8); for i := 0 to Len - 1 do tmp[i] := Table[Text[i]]; // branchless conversion FastAssignNew(Dest, tmp); end; procedure CaseSelf(var S: RawUtf8; Table: PNormTable); var i: PtrInt; P: PUtf8Char; begin P := UniqueRawUtf8(S); for i := 0 to length(S) - 1 do P[i] := Table[P[i]]; // branchless conversion end; function UpperCase(const S: RawUtf8): RawUtf8; begin CaseCopy(pointer(S), length(S), @NormToUpperAnsi7, result); end; procedure UpperCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8); begin CaseCopy(Text, Len, @NormToUpperAnsi7, Dest); end; procedure UpperCaseCopy(const Source: RawUtf8; var Dest: RawUtf8); begin CaseCopy(pointer(Source), length(Source), @NormToUpperAnsi7, Dest); end; procedure UpperCaseSelf(var S: RawUtf8); begin CaseSelf(S, @NormToUpperAnsi7); end; function LowerCase(const S: RawUtf8): RawUtf8; begin CaseCopy(pointer(S), length(S), @NormToLowerAnsi7, result); end; procedure LowerCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8); begin CaseCopy(Text, Len, @NormToLowerAnsi7, Dest); end; procedure LowerCaseSelf(var S: RawUtf8); begin CaseSelf(S, @NormToLowerAnsi7); end; function PosExIPas(pSub, p: PUtf8Char; Offset: PtrUInt; Lookup: PNormTable): PtrInt; var len, lenSub: PtrInt; ch: AnsiChar; pStart, pStop: PUtf8Char; label s2, s6, tt, t0, t1, t2, t3, t4, s0, s1, fnd, quit; begin result := 0; if (p = nil) or (pSub = nil) or (PtrInt(Offset) <= 0) or (Lookup = nil) then goto quit; len := PStrLen(p - _STRLEN)^; lenSub := PStrLen(pSub - _STRLEN)^ - 1; if (len < lenSub + PtrInt(Offset)) or (lenSub < 0) then goto quit; pStop := p + len; inc(p, lenSub); inc(pSub, lenSub); pStart := p; p := @p[Offset + 3]; ch := Lookup[pSub[0]]; lenSub := -lenSub; if p < pStop then goto s6; dec(p, 4); goto s2; s6: // check 6 chars per loop iteration with O(1) case comparison if ch = Lookup[p[-4]] then goto t4; if ch = Lookup[p[-3]] then goto t3; if ch = Lookup[p[-2]] then goto t2; if ch = Lookup[p[-1]] then goto t1; s2:if ch = Lookup[p[0]] then goto t0; s1:if ch = Lookup[p[1]] then goto tt; s0:inc(p, 6); if p < pStop then goto s6; dec(p, 4); if p >= pStop then goto quit; goto s2; t4:dec(p, 2); t2:dec(p, 2); goto t0; t3:dec(p, 2); t1:dec(p, 2); tt:len := lenSub; if lenSub <> 0 then repeat if (Lookup[pSub[len]] <> Lookup[p[len + 1]]) or (Lookup[pSub[len + 1]] <> Lookup[p[len + 2]]) then goto s0; inc(len, 2); until len >= 0; inc(p, 2); if p <= pStop then goto fnd; goto quit; t0:len := lenSub; if lenSub <> 0 then repeat if (Lookup[pSub[len]] <> Lookup[p[len]]) or (Lookup[pSub[len + 1]] <> Lookup[p[len + 1]]) then goto s1; inc(len, 2); until len >= 0; inc(p); fnd: result := p - pStart; quit: end; function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt): PtrInt; begin result := PosExIPas(pointer(SubStr), pointer(S), Offset, @NormToUpperAnsi7); end; function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt; Lookup: PNormTable): PtrInt; begin if Lookup = nil then {$ifdef CPUX86} result := PosEx(SubStr, S, Offset) {$else} result := PosExPas(pointer(SubStr), pointer(S), Offset) {$endif CPUX86} else result := PosExIPas(pointer(SubStr), pointer(S), Offset, Lookup); end; { ************ UTF-8 String Manipulation Functions } function StartWithExact(const text, textStart: RawUtf8): boolean; var l: PtrInt; begin l := length(textStart); result := (length(text) >= l) and CompareMem(pointer(text), pointer(textStart), l); end; function EndWithExact(const text, textEnd: RawUtf8): boolean; var l, o: PtrInt; begin l := length(textEnd); o := length(text) - l; result := (o >= 0) and CompareMem(PUtf8Char(pointer(text)) + o, pointer(textEnd), l); end; function GetNextLine(source: PUtf8Char; out next: PUtf8Char; andtrim: boolean): RawUtf8; var beg: PUtf8Char; begin if source = nil then begin {$ifdef FPC} FastAssignNew(result); {$else} result := ''; {$endif FPC} next := source; exit; end; if andtrim then // optional trim left while source^ in [#9, ' '] do inc(source); beg := source; repeat // just here to avoid a goto if source[0] > #13 then if source[1] > #13 then if source[2] > #13 then if source[3] > #13 then begin inc(source, 4); // fast process 4 chars per loop continue; end else inc(source, 3) else inc(source, 2) else inc(source); case source^ of #0: next := nil; #10: next := source + 1; #13: if source[1] = #10 then next := source + 2 else next := source + 1; else begin inc(source); continue; end; end; if andtrim then // optional trim right while (source > beg) and (source[-1] in [#9, ' ']) do dec(source); FastSetString(result, beg, source - beg); exit; until false; end; function LeftU(const S: RawUtf8; n: PtrInt): RawUtf8; begin result := Copy(S, 1, n); end; function RightU(const S: RawUtf8; n: PtrInt): RawUtf8; var L: PtrInt; begin L := length(S); if n > L then n := L; result := Copy(S, L + 1 - n, n); end; function TrimLeft(const S: RawUtf8): RawUtf8; var i, l: PtrInt; begin l := Length(S); i := 1; while (i <= l) and (S[i] <= ' ') do inc(i); if i = 1 then result := S else FastSetString(result, @PByteArray(S)[i - 1], l - i); end; function TrimRight(const S: RawUtf8): RawUtf8; var i: PtrInt; begin i := Length(S); while (i > 0) and (S[i] <= ' ') do dec(i); FastSetString(result, pointer(S), i); end; procedure TrimLeftLines(var S: RawUtf8); var P, D: PUtf8Char; begin if S = '' then exit; P := UniqueRawUtf8(S); D := P; // in-place process repeat while (P^ <= ' ') and (P^ <> #0) do inc(P); while not (P^ in [#0, #10, #13]) do begin D^ := P^; inc(P); inc(D); end; if P^ = #0 then break; D^ := #10; inc(D); until false; if D = pointer(S) then S := '' else FakeLength(S, D); // no SetLength needed end; procedure TrimChars(var S: RawUtf8; Left, Right: PtrInt); var P: PUtf8Char; begin P := pointer(S); if P = nil then exit; if Left < 0 then Left := 0; if Right < 0 then Right := 0; inc(Right, Left); if Right = 0 then exit; // nothing to trim Right := PStrLen(P - _STRLEN)^ - Right; // compute new length if Right > 0 then if PStrCnt(P - _STRCNT)^ = 1 then // RefCnt=1 ? begin PStrLen(P - _STRLEN)^ := Right; // we can modify it in-place if Left <> 0 then MoveFast(P[Left], P^, Right); P[Right] := #0; end else FastSetString(S, P + Left, Right) // create a new unique string else FastAssignNew(S); end; function SplitRight(const Str: RawUtf8; SepChar: AnsiChar; LeftStr: PRawUtf8): RawUtf8; var i: PtrInt; begin for i := length(Str) downto 1 do if Str[i] = SepChar then begin FastSetString(result, @PByteArray(Str)[i], length(Str) - i); if LeftStr <> nil then FastSetString(LeftStr^, pointer(Str), i - 1); exit; end; result := Str; if LeftStr <> nil then FastAssignNew(LeftStr^); end; function SplitRights(const Str, SepChar: RawUtf8): RawUtf8; var i, j, sep: PtrInt; c: AnsiChar; begin sep := length(SepChar); if sep > 0 then if sep = 1 then result := SplitRight(Str, SepChar[1]) else begin for i := length(Str) downto 1 do begin c := Str[i]; for j := 1 to sep do if c = SepChar[j] then begin FastSetString(result, @PByteArray(Str)[i], length(Str) - i); exit; end; end; end; result := Str; end; function Split(const Str, SepStr: RawUtf8; var LeftStr, RightStr: RawUtf8; ToUpperCase: boolean): boolean; var i: PtrInt; tmp: pointer; // may be called as Split(Str,SepStr,Str,RightStr) begin if length(SepStr) = 1 then i := PosExChar(SepStr[1], Str) // may use SSE2 on i386/x86_64 else i := PosEx(SepStr, Str); if i = 0 then begin LeftStr := Str; RightStr := ''; result := false; end else begin dec(i); tmp := nil; FastSetString(RawUtf8(tmp), pointer(Str), i); inc(i, length(SepStr)); FastSetString(RightStr, @PByteArray(Str)[i], length(Str) - i); FastAssignNew(LeftStr, tmp); result := true; end; if ToUpperCase then begin UpperCaseSelf(LeftStr); UpperCaseSelf(RightStr); end; end; function Split(const Str, SepStr: RawUtf8; var LeftStr: RawUtf8; ToUpperCase: boolean): RawUtf8; begin Split(Str, SepStr, LeftStr, result, ToUpperCase); end; function Split(const Str: RawUtf8; const SepStr: array of RawUtf8; const DestPtr: array of PRawUtf8): PtrInt; var s, i, j: PtrInt; P: pointer; begin j := 1; result := 0; s := 0; if high(SepStr) >= 0 then while result <= high(DestPtr) do begin P := @PByteArray(Str)[j - 1]; i := PosEx(SepStr[s], Str, j); if i = 0 then begin if DestPtr[result] <> nil then FastSetString(DestPtr[result]^, P, length(Str) - j + 1); inc(result); break; end; if DestPtr[result] <> nil then FastSetString(DestPtr[result]^, P, i - j); inc(result); if s < high(SepStr) then inc(s); j := i + 1; end; for i := result to high(DestPtr) do if DestPtr[i] <> nil then FastAssignNew(DestPtr[i]^); end; function IsVoid(const text: RawUtf8): boolean; var i: PtrInt; begin result := false; for i := 1 to length(text) do if text[i] > ' ' then exit; result := true; end; function TrimControlChars(const text: RawUtf8): RawUtf8; var len, i, j, n: PtrInt; P: PAnsiChar; begin len := length(text); for i := 1 to len do if text[i] <= ' ' then begin n := i - 1; FastSetString(result, len); P := pointer(result); if n > 0 then MoveFast(pointer(text)^, P^, n); for j := i + 1 to len do if text[j] > ' ' then begin P[n] := text[j]; inc(n); end; FakeSetLength(result, n); exit; end; result := text; // no control char found end; function TrimChar(const text: RawUtf8; const exclude: TSynAnsicharSet): RawUtf8; var len, i, j, n: PtrInt; P: PAnsiChar; begin len := length(text); for i := 1 to len do if text[i] in exclude then begin n := i - 1; FastSetString(result, len - 1); P := pointer(result); if n > 0 then MoveFast(pointer(text)^, P^, n); for j := i + 1 to len do if not (text[j] in exclude) then begin P[n] := text[j]; inc(n); end; FakeSetLength(result, n); exit; end; result := text; // no exclude char found end; function TrimOneChar(const text: RawUtf8; exclude: AnsiChar): RawUtf8; var first, len, i: PtrInt; c: AnsiChar; P: PAnsiChar; begin len := length(text); first := ByteScanIndex(pointer(text), len, ord(exclude)); if first < 0 then begin result := text; // no exclude char found exit; end; FastSetString(result, len - 1); P := pointer(result); MoveFast(pointer(text)^, P^, first); inc(P, first); for i := first + 1 to len do begin c := text[i]; if c <> exclude then begin P^ := c; inc(P); end; end; FakeSetLength(result, P - pointer(result)); end; function OnlyChar(const text: RawUtf8; const only: TSynAnsicharSet): RawUtf8; var i: PtrInt; exclude: array[0..(SizeOf(only) shr POINTERSHR) - 1] of PtrInt; begin // reverse bits in local stack copy before calling TrimChar() for i := 0 to (SizeOf(only) shr POINTERSHR) - 1 do exclude[i] := not PPtrIntArray(@only)[i]; result := TrimChar(text, TSynAnsicharSet(exclude)); end; procedure FillZero(var secret: RawByteString); begin if secret <> '' then with PStrRec(Pointer(PtrInt(secret) - _STRRECSIZE))^ do if refCnt = 1 then // avoid GPF if const FillCharFast(pointer(secret)^, length, 0); FastAssignNew(secret); // dec refCnt end; procedure FillZero(var secret: RawUtf8); begin FillZero(RawByteString(secret)); end; procedure FillZero(var secret: SpiUtf8); begin FillZero(RawByteString(secret)); end; {$ifdef HASVARUSTRING} procedure FillZero(var secret: UnicodeString); begin if secret <> '' then with PStrRec(Pointer(PtrInt(secret) - _STRRECSIZE))^ do if refCnt = 1 then // avoid GPF if const FillCharFast(pointer(secret)^, length * SizeOf(WideChar), 0); Finalize(secret); // dec refCnt end; {$endif HASVARUSTRING} procedure FillZero(var secret: TBytes); begin if secret <> nil then with PDynArrayRec(Pointer(PtrInt(secret) - _DARECSIZE))^ do if refCnt = 1 then // avoid GPF if const FillCharFast(pointer(secret)^, length, 0); secret := nil; // dec refCnt end; function StringReplaceAllProcess(const S, OldPattern, NewPattern: RawUtf8; found: integer; Lookup: PNormTable): RawUtf8; var i, last, oldlen, newlen, sharedlen: PtrInt; posCount: integer; pos: TIntegerDynArray; src, dst: PAnsiChar; begin oldlen := length(OldPattern); newlen := length(NewPattern); SetLength(pos, 64); pos[0] := found; posCount := 1; repeat found := PosExI(OldPattern, S, found + oldlen, Lookup); if found = 0 then break; AddInteger(pos, posCount, found); until false; FastSetString(result, Length(S) + (newlen - oldlen) * posCount); last := 1; src := pointer(S); dst := pointer(result); for i := 0 to posCount - 1 do begin sharedlen := pos[i] - last; MoveFast(src^, dst^, sharedlen); inc(src, sharedlen + oldlen); inc(dst, sharedlen); if newlen > 0 then begin MoveByOne(pointer(NewPattern), dst, newlen); inc(dst, newlen); end; last := pos[i] + oldlen; end; MoveFast(src^, dst^, length(S) - last + 1); end; function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8; Lookup: PNormTable): RawUtf8; var found: integer; begin if (S = '') or (OldPattern = '') or (OldPattern = NewPattern) then result := S else begin if (Lookup = nil) and (length(OldPattern) = 1) then found := ByteScanIndex(pointer(S), {%H-}PStrLen(PtrUInt(S) - _STRLEN)^, byte(OldPattern[1])) + 1 else found := PosExI(OldPattern, S, 1, Lookup); // handle Lookup=nil if found = 0 then result := S else result := StringReplaceAllProcess(S, OldPattern, NewPattern, found, Lookup); end; end; function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8; CaseInsensitive: boolean): RawUtf8; begin result := StringReplaceAll(S, OldPattern, NewPattern, NORM2CASE[CaseInsensitive]); end; function StringReplaceAll(const S: RawUtf8; const OldNewPatternPairs: array of RawUtf8; CaseInsensitive: boolean): RawUtf8; var n, i: PtrInt; tab: PNormTable; begin result := S; n := high(OldNewPatternPairs); if (n <= 0) or (n and 1 <> 1) then exit; tab := NORM2CASE[CaseInsensitive]; for i := 0 to n shr 1 do result := StringReplaceAll(result, OldNewPatternPairs[i * 2], OldNewPatternPairs[i * 2 + 1], tab); end; function StringReplaceChars(const Source: RawUtf8; OldChar, NewChar: AnsiChar): RawUtf8; var i, j, n: PtrInt; P: PAnsiChar; begin if (OldChar <> NewChar) and (Source <> '') then begin n := length(Source); i := ByteScanIndex(pointer(Source), n, ord(OldChar)); if i >= 0 then begin FastSetString(result, pointer(Source), n); P := pointer(result); for j := i to n - 1 do if P[j] = OldChar then P[j] := NewChar; exit; end; end; result := Source; end; function StringReplaceTabs(const Source, TabText: RawUtf8): RawUtf8; procedure Process(S, D, T: PAnsiChar; TLen: integer); begin repeat if S^ = #0 then break else if S^ <> #9 then begin D^ := S^; inc(D); inc(S); end else begin if TLen > 0 then begin MoveByOne(T, D, TLen); inc(D, TLen); end; inc(S); end; until false; end; var L, i, n, ttl: PtrInt; begin ttl := length(TabText); L := Length(Source); n := 0; if ttl <> 0 then for i := 1 to L do if Source[i] = #9 then inc(n); if n = 0 then begin result := Source; exit; end; FastSetString(result, L + n * pred(ttl)); Process(pointer(Source), pointer(result), pointer(TabText), ttl); end; function RawUtf8OfChar(Ch: AnsiChar; Count: integer): RawUtf8; begin if Count <= 0 then FastAssignNew(result) else begin FastSetString(result, Count); FillCharFast(pointer(result)^, Count, byte(Ch)); end; end; function QuotedStr(const S: RawUtf8; Quote: AnsiChar): RawUtf8; begin QuotedStr(pointer(S), length(S), Quote, result); end; procedure QuotedStr(const S: RawUtf8; Quote: AnsiChar; var result: RawUtf8); var P: PUtf8Char; tmp: pointer; // will hold a RawUtf8 with no try..finally exception block begin tmp := nil; P := pointer(S); if (P <> nil) and (P = pointer(result)) then begin RawUtf8(tmp) := S; // make private ref-counted copy for QuotedStr(U,'"',U) P := pointer(tmp); end; QuotedStr(P, length(S), Quote, result); if tmp <> nil then {$ifdef FPC} FastAssignNew(tmp); {$else} RawUtf8(tmp) := ''; {$endif FPC} end; procedure QuotedStr(P: PUtf8Char; PLen: PtrInt; Quote: AnsiChar; var result: RawUtf8); var i, quote1, nquote: PtrInt; R: PUtf8Char; c: AnsiChar; begin nquote := 0; quote1 := ByteScanIndex(pointer(P), PLen, byte(Quote)); // asm if available if quote1 >= 0 then for i := quote1 to PLen - 1 do if P[i] = Quote then inc(nquote); FastSetString(result, PLen + nquote + 2); R := pointer(result); R^ := Quote; inc(R); if nquote = 0 then begin MoveFast(P^, R^, PLen); R[PLen] := Quote; end else begin MoveFast(P^, R^, quote1); inc(R, quote1); inc(PLen, PtrInt(PtrUInt(P))); // efficient use of registers on FPC inc(quote1, PtrInt(PtrUInt(P))); repeat if quote1 = PLen then break; c := PAnsiChar(quote1)^; inc(quote1); R^ := c; inc(R); if c <> Quote then continue; R^ := c; inc(R); until false; R^ := Quote; end; end; function GotoEndOfQuotedString(P: PUtf8Char): PUtf8Char; var quote: AnsiChar; begin // P^='"' or P^='''' at function call quote := P^; inc(P); repeat if P^ = #0 then break else if P^ <> quote then inc(P) else if P[1] = quote then // allow double quotes inside string inc(P, 2) else break; // end quote until false; result := P; end; // P^='"' or P^=#0 at function return function GotoNextNotSpace(P: PUtf8Char): PUtf8Char; begin {$ifdef FPC} while (P^ <= ' ') and (P^ <> #0) do inc(P); {$else} if P^ in [#1..' '] then repeat inc(P) until not (P^ in [#1..' ']); {$endif FPC} result := P; end; function GotoNextNotSpaceSameLine(P: PUtf8Char): PUtf8Char; begin while P^ in [#9, ' '] do inc(P); result := P; end; function GotoNextSpace(P: PUtf8Char): PUtf8Char; begin if P^ > ' ' then repeat inc(P) until P^ <= ' '; result := P; end; function NextNotSpaceCharIs(var P: PUtf8Char; ch: AnsiChar): boolean; begin while (P^ <= ' ') and (P^ <> #0) do inc(P); if P^ = ch then begin inc(P); result := true; end else result := false; end; function GotoNextSqlIdentifier(P: PUtf8Char; tab: PTextCharSet): PUtf8Char; {$ifdef HASINLINE} inline; {$endif} begin while tcCtrlNot0Comma in tab[P^] do // in [#1..' ', ';'] inc(P); if PWord(P)^ = ord('/') + ord('*') shl 8 then begin // detect and ignore e.g. '/*nocache*/' repeat inc(P); if PWord(P)^ = ord('*') + ord('/') shl 8 then begin inc(P, 2); break; end; until P^ = #0; while tcCtrlNot0Comma in tab[P^] do inc(P); end; result := P; end; function GetNextFieldProp(var P: PUtf8Char; var Prop: RawUtf8): boolean; var B: PUtf8Char; tab: PTextCharSet; begin tab := @TEXT_CHARS; P := GotoNextSqlIdentifier(P, tab); // handle /*comment*/ B := P; while tcIdentifier in tab[P^] do inc(P); // go to end of ['_', '0'..'9', 'a'..'z', 'A'..'Z'] chars FastSetString(Prop, B, P - B); P := GotoNextSqlIdentifier(P, tab); result := Prop <> ''; end; function GetNextFieldPropSameLine(var P: PUtf8Char; var Prop: ShortString): boolean; var B: PUtf8Char; tab: PTextCharSet; begin tab := @TEXT_CHARS; while tcCtrlNotLF in tab[P^] do inc(P); // ignore [#1..#9, #11, #12, #14..' '] B := P; while tcIdentifier in tab[P^] do inc(P); // go to end of field name SetString(Prop, PAnsiChar(B), P - B); while tcCtrlNotLF in TEXT_CHARS[P^] do inc(P); result := Prop <> ''; end; function UnQuoteSqlStringVar(P: PUtf8Char; out Value: RawUtf8): PUtf8Char; var quote: AnsiChar; PBeg, PS: PUtf8Char; internalquote: PtrInt; begin result := nil; if P = nil then exit; quote := P^; // " or ' inc(P); // compute unquoted string length PBeg := P; internalquote := 0; P := PosChar(P, quote); // fast SSE2 search on x86_64 if P = nil then exit; // we need at least an ending quote while true do if P^ = #0 then exit // where is my quote? else if P^ <> quote then inc(P) else if P[1] = quote then begin inc(P, 2); // allow double quotes inside string inc(internalquote); end else break; // end quote // create unquoted string if internalquote = 0 then // no quote within FastSetString(Value, PBeg, P - PBeg) else begin // unescape internal quotes pointer(Value) := FastNewString(P - PBeg - internalquote, CP_UTF8); P := PBeg; PS := pointer(Value); repeat if P[0] = quote then if P[1] = quote then // allow double quotes inside string inc(P) else // end quote break; PS^ := P[0]; inc(PS); inc(P); until false; end; result := P + 1; end; function UnQuoteSqlString(const Value: RawUtf8): RawUtf8; begin UnQuoteSqlStringVar(pointer(Value), result); end; function UnQuotedSqlSymbolName(const ExternalDBSymbol: RawUtf8): RawUtf8; begin if (ExternalDBSymbol <> '') and (ExternalDBSymbol[1] in ['[', '"', '''', '(']) then // e.g. for ZDBC's GetFields() result := copy(ExternalDBSymbol, 2, length(ExternalDBSymbol) - 2) else result := ExternalDBSymbol; end; function IdemPCharAndGetNextLine(var source: PUtf8Char; searchUp: PAnsiChar): boolean; begin if source = nil then result := false else begin result := IdemPChar(source, searchUp); source := GotoNextLine(source); end; end; function FindNameValue(P: PUtf8Char; UpperName: PAnsiChar): PUtf8Char; var table: PNormTable; // faster even on i386 u: PAnsiChar; label eof, eol; begin if (P = nil) or (UpperName = nil) then goto eof; table := @NormToUpperAnsi7; repeat if table[P^] <> UpperName^ then // first character is likely not to match repeat // quickly go to end of current line repeat eol: if P^ <= #13 then break; inc(P); until false; if (P^ = #13) or (P^ = #10) then begin repeat inc(P); until (P^ <> #10) and (P^ <> #13); if P^ = #0 then goto eof; break; // handle next line end else if P^ <> #0 then continue; // e.g. #9 eof: result := nil; // reached P^=#0 -> not found exit; until false else begin // first char did match -> try other chars inc(P); u := UpperName + 1; repeat if u^ = #0 then break else if u^ <> table[P^] then goto eol; inc(P); inc(u); until false; result := P; // if found, points just after UpperName exit; end; until false; end; function FindNameValuePointer(NameValuePairs: PUtf8Char; UpperName: PAnsiChar; out FoundLen: PtrInt; UpperNameSeparator: AnsiChar): PUtf8Char; var P: PUtf8Char; L: PtrInt; begin P := FindNameValue(NameValuePairs, UpperName); if P <> nil then repeat if UpperNameSeparator <> #0 then if P^ = UpperNameSeparator then inc(P) // e.g. THttpSocket.HeaderGetValue uses UpperNameSeparator=':' else break; while P^ in [#9, ' '] do // trim left inc(P); L := 0; while P[L] > #13 do // end of line/value inc(L); while P[L - 1] = ' ' do // trim right dec(L); FoundLen := L; break; until false; result := P; end; function FindNameValue(const NameValuePairs: RawUtf8; UpperName: PAnsiChar; var Value: RawUtf8; KeepNotFoundValue: boolean; UpperNameSeparator: AnsiChar): boolean; var P: PUtf8Char; L: PtrInt; begin P := FindNameValuePointer(pointer(NameValuePairs), UpperName, L, UpperNameSeparator); if P <> nil then begin FastSetString(Value, P, L); result := true; end else begin if not KeepNotFoundValue then {$ifdef FPC} FastAssignNew(Value); {$else} Value := ''; {$endif FPC} result := false; end; end; function GetLineSize(P, PEnd: PUtf8Char): PtrUInt; var c: byte; begin {$ifdef CPUX64} if PEnd <> nil then begin result := BufferLineLength(P, PEnd); // use branchless SSE2 on x86_64 exit; end; result := PtrUInt(P) - 1; {$else} result := PtrUInt(P) - 1; if PEnd <> nil then repeat // inlined BufferLineLength() inc(result); if PtrUInt(result) < PtrUInt(PEnd) then begin c := PByte(result)^; if (c > 13) or ((c <> 10) and (c <> 13)) then continue; end; break; until false else {$endif CPUX64} repeat // inlined BufferLineLength() ending at #0 for PEnd=nil inc(result); c := PByte(result)^; if (c > 13) or ((c <> 0) and (c <> 10) and (c <> 13)) then continue; break; until false; dec(result, PtrUInt(P)); // returns length end; function GetLineSizeSmallerThan(P, PEnd: PUtf8Char; aMinimalCount: integer): boolean; begin result := false; if P <> nil then while (P < PEnd) and (P^ <> #10) and (P^ <> #13) do if aMinimalCount = 0 then exit else begin dec(aMinimalCount); inc(P); end; result := true; end; {$ifndef PUREMORMOT2} function GetNextStringLineToRawUnicode(var P: PChar): RawUnicode; var S: PChar; begin if P = nil then result := '' else begin S := P; while S^ >= ' ' do inc(S); result := StringToRawUnicode(P, S - P); while (S^ <> #0) and (S^ < ' ') do inc(S); // ignore e.g. #13 or #10 if S^ <> #0 then P := S else P := nil; end; end; {$endif PUREMORMOT2} function TrimLeftLowerCase(const V: RawUtf8): PUtf8Char; begin result := Pointer(V); if result <> nil then begin while result^ in ['a'..'z'] do inc(result); if result^ = #0 then result := Pointer(V); end; end; function TrimLeftLowerCaseToShort(V: PShortString): ShortString; begin TrimLeftLowerCaseToShort(V, result); end; procedure TrimLeftLowerCaseToShort(V: PShortString; out result: ShortString); var P: PAnsiChar; L: integer; begin L := length(V^); P := @V^[1]; while (L > 0) and (P^ in ['a'..'z']) do begin inc(P); dec(L); end; if L = 0 then result := V^ else SetString(result, P, L); end; function TrimLeftLowerCaseShort(V: PShortString): RawUtf8; var P: PAnsiChar; L: integer; begin L := length(V^); P := @V^[1]; while (L > 0) and (P^ in ['a'..'z']) do begin inc(P); dec(L); end; if L = 0 then FastSetString(result, @V^[1], length(V^)) else FastSetString(result, P, L); end; procedure AppendShortComma(text: PAnsiChar; len: PtrInt; var result: ShortString; trimlowercase: boolean); begin if trimlowercase then while text^ in ['a'..'z'] do if len = 1 then exit else begin inc(text); dec(len); end; if integer(ord(result[0])) + len >= 255 then exit; if len > 0 then MoveByOne(text, @result[ord(result[0]) + 1], len); inc(result[0], len + 1); result[ord(result[0])] := ','; end; function IdemPropNameUSmallNotVoid(P1, P2, P1P2Len: PtrInt): boolean; {$ifdef HASINLINE}inline;{$endif} begin inc(P1P2Len, P1); dec(P2, P1); repeat result := (PByte(P1)^ xor ord(PAnsiChar(P1)[P2])) and $df = 0; if not result then exit; inc(P1); until P1 >= P1P2Len; end; function FindShortStringListExact(List: PShortString; MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer; var PLen: PtrInt; begin if aValueLen <> 0 then for result := 0 to MaxValue do begin PLen := PByte(List)^; if (PLen = aValueLen) and IdemPropNameUSmallNotVoid(PtrInt(@List^[1]), PtrInt(aValue), PLen) then exit; List := pointer(@PAnsiChar(PLen)[PtrUInt(List) + 1]); // next end; result := -1; end; function FindShortStringListTrimLowerCase(List: PShortString; MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer; var PLen: PtrInt; begin if aValueLen <> 0 then for result := 0 to MaxValue do begin PLen := ord(List^[0]); inc(PUtf8Char(List)); repeat // trim lower case if not (PUtf8Char(List)^ in ['a'..'z']) then break; inc(PUtf8Char(List)); dec(PLen); until PLen = 0; if (PLen = aValueLen) and IdemPropNameUSmallNotVoid(PtrInt(aValue), PtrInt(List), PLen) then exit; inc(PUtf8Char(List), PLen); // next end; result := -1; end; function FindShortStringListTrimLowerCaseExact(List: PShortString; MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer; var PLen: PtrInt; begin if aValueLen <> 0 then for result := 0 to MaxValue do begin PLen := ord(List^[0]); inc(PUtf8Char(List)); repeat if not (PUtf8Char(List)^ in ['a'..'z']) then break; inc(PUtf8Char(List)); dec(PLen); until PLen = 0; if (PLen = aValueLen) and CompareMemSmall(aValue, List, PLen) then exit; inc(PUtf8Char(List), PLen); end; result := -1; end; function UnCamelCase(const S: RawUtf8): RawUtf8; var tmp: TSynTempBuffer; destlen: PtrInt; begin if S = '' then result := '' else begin destlen := UnCamelCase(tmp.Init(length(S) * 2), pointer(S)); tmp.Done(PAnsiChar(tmp.buf) + destlen, result); end; end; function UnCamelCase(D, P: PUtf8Char): integer; var Space, SpaceBeg, DBeg: PUtf8Char; CapitalCount: integer; Number: boolean; label Next; begin DBeg := D; if (D <> nil) and (P <> nil) then begin // avoid GPF Space := D; SpaceBeg := D; repeat CapitalCount := 0; Number := P^ in ['0'..'9']; if Number then repeat inc(CapitalCount); D^ := P^; inc(P); inc(D); until not (P^ in ['0'..'9']) else repeat inc(CapitalCount); D^ := P^; inc(P); inc(D); until not (P^ in ['A'..'Z']); if P^ = #0 then break; // no lowercase conversion of last fully uppercased word if (CapitalCount > 1) and not Number then begin dec(P); dec(D); end; while P^ in ['a'..'z'] do begin D^ := P^; inc(D); inc(P); end; if P^ = '_' then if P[1] = '_' then begin D^ := ':'; inc(P); inc(D); goto Next; end else begin PWord(D)^ := ord(' ') + ord('-') shl 8; inc(D, 2); Next: if Space = SpaceBeg then SpaceBeg := D + 1; inc(P); Space := D + 1; end else Space := D; if P^ = #0 then break; D^ := ' '; inc(D); until false; if Space > DBeg then dec(Space); while Space > SpaceBeg do begin if Space^ in ['A'..'Z'] then if not (Space[1] in ['A'..'Z', ' ']) then inc(Space^, 32); // lowercase conversion of not last fully uppercased word dec(Space); end; end; result := D - DBeg; end; procedure CamelCase(P: PAnsiChar; len: PtrInt; var s: RawUtf8; const isWord: TSynByteSet); var i: PtrInt; d: PAnsiChar; tmp: array[byte] of AnsiChar; begin if len > SizeOf(tmp) then len := SizeOf(tmp); for i := 0 to len - 1 do if not (ord(P[i]) in isWord) then begin if i > 0 then begin MoveFast(P^, tmp, i); inc(P, i); dec(len, i); end; d := @tmp[i]; while len > 0 do begin while (len > 0) and not (ord(P^) in isWord) do begin inc(P); dec(len); end; if len = 0 then break; d^ := NormToUpperAnsi7[P^]; inc(d); repeat inc(P); dec(len); if not (ord(P^) in isWord) then break; d^ := P^; inc(d); until len = 0; end; P := @tmp; len := d - tmp; break; end; FastSetString(s, P, len); end; procedure CamelCase(const text: RawUtf8; var s: RawUtf8; const isWord: TSynByteSet); begin CamelCase(pointer(text), length(text), s, isWord); end; procedure GetCaptionFromPCharLen(P: PUtf8Char; out result: string); var Temp: array[byte] of AnsiChar; begin if P = nil then exit; {$ifdef UNICODE} Utf8DecodeToUnicodeString(Temp, UnCamelCase(@Temp, P), result); {$else} SetString(result, PAnsiChar(@Temp), UnCamelCase(@Temp, P)); {$endif UNICODE} if Assigned(LoadResStringTranslate) then LoadResStringTranslate(result); end; { ************ TRawUtf8DynArray Processing Functions } function IsZero(const Values: TRawUtf8DynArray): boolean; var i: PtrInt; begin result := false; for i := 0 to length(Values) - 1 do if Values[i] <> '' then exit; result := true; end; function TRawUtf8DynArrayFrom(const Values: array of RawUtf8): TRawUtf8DynArray; var i: PtrInt; begin Finalize(result); SetLength(result, length(Values)); for i := 0 to high(Values) do result[i] := Values[i]; end; function FindRawUtf8(Values: PRawUtf8; const Value: RawUtf8; ValuesCount: integer; CaseSensitive: boolean): integer; var ValueLen: TStrLen; begin dec(ValuesCount); ValueLen := length(Value); if ValueLen = 0 then for result := 0 to ValuesCount do if Values^ = '' then exit else inc(Values) else if CaseSensitive then for result := 0 to ValuesCount do if (PtrUInt(Values^) <> 0) and ({%H-}PStrLen(PtrUInt(Values^) - _STRLEN)^ = ValueLen) and CompareMemFixed(pointer(PtrInt(Values^)), pointer(Value), ValueLen) then exit else inc(Values) else for result := 0 to ValuesCount do if (PtrUInt(Values^) <> 0) and // StrIComp() won't change length ({%H-}PStrLen(PtrUInt(Values^) - _STRLEN)^ = ValueLen) and (StrIComp(pointer(Values^), pointer(Value)) = 0) then exit else inc(Values); result := -1; end; function FindRawUtf8(const Values: TRawUtf8DynArray; const Value: RawUtf8; CaseSensitive: boolean): integer; begin result := FindRawUtf8(pointer(Values), Value, length(Values), CaseSensitive); end; function FindRawUtf8(const Values: array of RawUtf8; const Value: RawUtf8; CaseSensitive: boolean): integer; begin result := high(Values); if result >= 0 then result := FindRawUtf8(@Values[0], Value, result + 1, CaseSensitive); end; function AddRawUtf8(var Values: TRawUtf8DynArray; const Value: RawUtf8; NoDuplicates, CaseSensitive: boolean): boolean; var i: PtrInt; begin if NoDuplicates then begin i := FindRawUtf8(Values, Value, CaseSensitive); if i >= 0 then begin result := false; exit; end; end; i := length(Values); SetLength(Values, i + 1); Values[i] := Value; result := true; end; function AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; const Value: RawUtf8): PtrInt; begin result := ValuesCount; if result = Length(Values) then SetLength(Values, NextGrow(result)); Values[result] := Value; inc(ValuesCount); end; procedure AddRawUtf8(var Values: TRawUtf8DynArray; const Value: TRawUtf8DynArray); var n, o, i: PtrInt; begin n := length(Value); if n = 0 then exit; o := length(Values); SetLength(Values, o + n); for i := 0 to n - 1 do Values[o + i] := Value[i]; end; procedure AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; const Value: TRawUtf8DynArray); var n, o, i: PtrInt; begin n := length(Value); o := ValuesCount; inc(ValuesCount, n); if ValuesCount > Length(Values) then SetLength(Values, NextGrow(ValuesCount)); for i := 0 to n - 1 do Values[o + i] := Value[i]; end; function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray): boolean; var n, i: PtrInt; begin result := false; n := length(A); if n <> length(B) then exit; for i := 0 to n - 1 do if A[i] <> B[i] then exit; result := true; end; function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray; Count: integer): boolean; var i: PtrInt; begin result := false; for i := 0 to Count - 1 do if A[i] <> B[i] then exit; result := true; end; function AddString(var Values: TStringDynArray; const Value: string): PtrInt; begin result := length(Values); SetLength(Values, result + 1); Values[result] := Value; end; procedure StringDynArrayToRawUtf8DynArray(const Source: TStringDynArray; var Result: TRawUtf8DynArray); var i: PtrInt; begin Finalize(Result); SetLength(Result, length(Source)); for i := 0 to length(Source) - 1 do StringToUtf8(Source[i], Result[i]); end; procedure StringListToRawUtf8DynArray(Source: TStringList; var Result: TRawUtf8DynArray); var i: PtrInt; begin Finalize(Result); SetLength(Result, Source.Count); for i := 0 to Source.Count - 1 do StringToUtf8(Source[i], Result[i]); end; function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt; begin result := FastLocatePUtf8CharSorted(P, R, Value, TUtf8Compare(@StrComp)); end; function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char; Compare: TUtf8Compare): PtrInt; var L, i, cmp: PtrInt; begin // fast O(log(n)) binary search if (not Assigned(Compare)) or (R < 0) then result := 0 else if Compare(P^[R], Value) < 0 then // quick return if already sorted result := R + 1 else begin L := 0; result := -1; // return -1 if found repeat {$ifdef CPUX64} i := L + R; i := i shr 1; {$else} i := (L + R) shr 1; {$endif CPUX64} cmp := Compare(P^[i], Value); if cmp = 0 then exit; if cmp < 0 then L := i + 1 else R := i - 1; until L > R; while (i >= 0) and (Compare(P^[i], Value) >= 0) do dec(i); result := i + 1; // return the index where to insert end; end; function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char; Compare: TUtf8Compare): PtrInt; var L, cmp: PtrInt; begin // fast O(log(n)) binary search L := 0; if Assigned(Compare) and (R >= 0) then repeat {$ifdef CPUX64} result := L + R; result := result shr 1; {$else} result := (L + R) shr 1; {$endif CPUX64} cmp := Compare(P^[result], Value); if cmp = 0 then exit; if cmp < 0 then begin L := result + 1; if L <= R then continue; break; end; R := result - 1; if L <= R then continue; break; until false; result := -1; end; {$ifdef CPUX64} function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt; {$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif} {$ifdef win64} // P=rcx/rdi R=rdx/rsi Value=r8/rdx push rdi mov rdi, P // P=rdi {$endif win64} push r12 push r13 xor r9, r9 // L=r9 test R, R jl @err test Value, Value jz @void mov cl, byte ptr [Value] // to check first char (likely diverse) @s: lea rax, qword ptr [r9 + R] shr rax, 1 lea r12, qword ptr [rax - 1] // branchless main loop lea r13, qword ptr [rax + 1] mov r10, qword ptr [rdi + rax * 8] test r10, r10 jz @lt cmp cl, byte ptr [r10] je @eq cmovc R, r12 cmovnc r9, r13 @nxt: cmp r9, R jle @s @err: mov rax, -1 @found: pop r13 pop r12 {$ifdef win64} pop rdi {$endif win64} ret @lt: mov r9, r13 // very unlikely P[rax]=nil jmp @nxt @eq: mov r11, Value // first char equal -> check others @sub: mov cl, byte ptr [r10] add r10, 1 add r11, 1 test cl, cl jz @found mov cl, byte ptr [r11] cmp cl, byte ptr [r10] je @sub mov cl, byte ptr [Value] // reset first char cmovc R, r12 cmovnc r9, r13 cmp r9, R jle @s jmp @err @void: mov rax, -1 cmp qword ptr [P], 0 cmove rax, Value jmp @found end; {$else} function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt; var L: PtrInt; c: byte; piv, val: PByte; begin // fast O(log(n)) binary search using inlined StrCompFast() if R >= 0 then if Value <> nil then begin L := 0; repeat result := L + R; result := result shr 1; piv := pointer(P^[result]); if piv <> nil then begin val := pointer(Value); c := piv^; if c = val^ then repeat if c = 0 then exit; // StrComp(P^[result],Value)=0 inc(piv); inc(val); c := piv^; until c <> val^; if c > val^ then begin R := result - 1; // StrComp(P^[result],Value)>0 if L <= R then continue; break; end; end; L := result + 1; // StrComp(P^[result],Value)<0 if L <= R then continue; break; until false; end else if P^[0] = nil then begin // '' should be in lowest P[] slot result := 0; exit; end; result := -1; end; {$endif CPUX64} function FastFindUpperPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char; ValueLen: PtrInt): PtrInt; var tmp: array[byte] of AnsiChar; begin UpperCopy255Buf(@tmp, Value, ValueLen)^ := #0; result := FastFindPUtf8CharSorted(P, R, @tmp); end; function FastFindIndexedPUtf8Char(P: PPUtf8CharArray; R: PtrInt; var SortedIndexes: TCardinalDynArray; Value: PUtf8Char; ItemComp: TUtf8Compare): PtrInt; var L, cmp: PtrInt; begin // fast O(log(n)) binary search L := 0; if 0 <= R then repeat {$ifdef CPUX64} result := L + R; result := result shr 1; {$else} result := (L + R) shr 1; {$endif CPUX64} cmp := ItemComp(P^[SortedIndexes[result]], Value); if cmp = 0 then begin result := SortedIndexes[result]; exit; end; if cmp < 0 then begin L := result + 1; if L <= R then continue; break; end; R := result - 1; if L <= R then continue; break; until false; result := -1; end; function AddSortedRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; const Value: RawUtf8; CoValues: PIntegerDynArray; ForcedIndex: PtrInt; Compare: TUtf8Compare): PtrInt; var n: PtrInt; begin if ForcedIndex >= 0 then result := ForcedIndex else begin if not Assigned(Compare) then Compare := @StrComp; result := FastLocatePUtf8CharSorted(pointer(Values), ValuesCount - 1, pointer(Value), Compare); if result < 0 then exit; // Value exists -> fails end; n := Length(Values); if ValuesCount = n then begin n := NextGrow(n); SetLength(Values, n); if CoValues <> nil then SetLength(CoValues^, n); end; n := ValuesCount; if result < n then begin n := (n - result) * SizeOf(pointer); MoveFast(Pointer(Values[result]), Pointer(Values[result + 1]), n); PtrInt(Values[result]) := 0; // avoid GPF if CoValues <> nil then begin {$ifdef CPU64} n := n shr 1; {$endif} // 64-bit pointer to 32-bit integer MoveFast(CoValues^[result], CoValues^[result + 1], n); end; end else result := n; Values[result] := Value; inc(ValuesCount); end; type /// used internally for faster quick sort {$ifdef USERECORDWITHMETHODS} TQuickSortRawUtf8 = record {$else} TQuickSortRawUtf8 = object {$endif USERECORDWITHMETHODS} public Compare: TUtf8Compare; CoValues: PIntegerArray; pivot: pointer; procedure Sort(Values: PPointerArray; L, R: PtrInt); end; procedure TQuickSortRawUtf8.Sort(Values: PPointerArray; L, R: PtrInt); var I, J, P: PtrInt; tmp: Pointer; int: integer; begin if L < R then repeat I := L; J := R; P := (L + R) shr 1; repeat pivot := Values^[P]; while Compare(Values^[I], pivot) < 0 do inc(I); while Compare(Values^[J], pivot) > 0 do dec(J); if I <= J then begin tmp := Values^[J]; Values^[J] := Values^[I]; Values^[I] := tmp; if CoValues <> nil then begin int := CoValues^[J]; CoValues^[J] := CoValues^[I]; CoValues^[I] := int; end; if P = I then P := J else if P = J then P := I; inc(I); dec(J); end; until I > J; if J - L < R - I then begin // use recursion only for smaller range if L < J then Sort(Values, L, J); L := I; end else begin if I < R then Sort(Values, I, R); R := J; end; until L >= R; end; procedure QuickSortRawUtf8(var Values: TRawUtf8DynArray; ValuesCount: integer; CoValues: PIntegerDynArray; Compare: TUtf8Compare); var QS: TQuickSortRawUtf8; begin if Assigned(Compare) then QS.Compare := Compare else QS.Compare := @StrComp; if CoValues = nil then QS.CoValues := nil else QS.CoValues := pointer(CoValues^); QS.Sort(pointer(Values), 0, ValuesCount - 1); end; procedure QuickSortRawUtf8(Values: PRawUtf8Array; L, R: PtrInt; caseInsensitive: boolean); var QS: TQuickSortRawUtf8; begin QS.Compare := StrCompByCase[caseInsensitive]; QS.CoValues := nil; QS.Sort(pointer(Values), L, R); end; procedure MakeUniqueArray(var Values: TRawUtf8DynArray); begin Values := copy(Values); // sub-proc to avoid try..finally end; function DeleteRawUtf8(var Values: TRawUtf8DynArray; Index: PtrInt): boolean; var n: PtrInt; begin n := length(Values); if PtrUInt(Index) >= PtrUInt(n) then result := false else begin dec(n); if PDACnt(PAnsiChar(pointer(Values)) - _DACNT)^ > 1 then MakeUniqueArray(Values); Values[Index] := ''; // avoid GPF if n > Index then begin MoveFast(pointer(Values[Index + 1]), pointer(Values[Index]), (n - Index) * SizeOf(pointer)); PtrUInt(Values[n]) := 0; // avoid GPF end; SetLength(Values, n); result := true; end; end; function DeleteRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer; Index: integer; CoValues: PIntegerDynArray): boolean; var n: integer; begin n := ValuesCount; if cardinal(Index) >= cardinal(n) then result := false else begin dec(n); ValuesCount := n; if PDACnt(PAnsiChar(pointer(Values)) - _DACNT)^ > 1 then MakeUniqueArray(Values); Values[Index] := ''; // avoid GPF dec(n, Index); if n > 0 then begin if CoValues <> nil then MoveFast(CoValues^[Index + 1], CoValues^[Index], n * SizeOf(integer)); MoveFast(pointer(Values[Index + 1]), pointer(Values[Index]), n * SizeOf(pointer)); PtrUInt(Values[ValuesCount]) := 0; // avoid GPF end; result := true; end; end; {$ifdef OSPOSIX} { TPosixFileCaseInsensitive } constructor TPosixFileCaseInsensitive.Create( const aFolder: TFileName; aSubFolders: boolean); begin fFolder := aFolder; fSubFolders := aSubFolders; fFlushSeconds := 60; end; procedure TPosixFileCaseInsensitive.SetFolder(const aFolder: TFileName); begin if self = nil then exit; fSafe.WriteLock; try fFiles := nil; // force list refresh fFolder := aFolder; finally fSafe.WriteUnLock; end; end; procedure TPosixFileCaseInsensitive.SetSubFolders(aSubFolders: boolean); begin if (self = nil) or (fSubFolders = aSubFolders) then exit; fSubFolders := aSubFolders; Flush; end; procedure TPosixFileCaseInsensitive.OnIdle(tix64: Int64); begin if (self = nil) or (fFiles = nil) or (fFlushSeconds = 0) then exit; if tix64 = 0 then tix64 := GetTickCount64; if tix64 shr 10 < fNextTix then exit; fSafe.WriteLock; try fFiles := nil; // force list refresh finally fSafe.WriteUnLock; end; end; procedure TPosixFileCaseInsensitive.Flush; begin SetFolder(fFolder); end; function TPosixFileCaseInsensitive.Find(const aSearched: TFileName; aReadMs: PInteger): TFileName; var start, stop: Int64; i: PtrInt; fn: RawUtf8; begin result := ''; if aReadMs <> nil then aReadMs^ := 0; if (self = nil) or (fFolder = '') or (aSearched = '') then exit; if fFiles = nil then // need to refresh the cache begin fSafe.WriteLock; try if fFiles = nil then begin if aReadMs <> nil then QueryPerformanceMicroSeconds(start); fFiles := PosixFileNames(fFolder, fSubFolders); // fast syscall QuickSortRawUtf8(fFiles, length(fFiles), nil, @StrIComp); // e.g. 4392 filenames from /home/ab/dev/lib/ in 7.20ms if aReadMs <> nil then begin QueryPerformanceMicroSeconds(stop); aReadMs^ := stop - start; end; if fFlushSeconds <> 0 then fNextTix := (GetTickCount64 shr 10) + fFlushSeconds; end; finally fSafe.WriteUnLock; end; end; StringToUtf8(aSearched, fn); fSafe.ReadLock; // non-blocking lookup try i := FastFindPUtf8CharSorted( // efficient O(log(n)) binary search pointer(fFiles), high(fFiles), pointer(fn), @StrIComp); if i >= 0 then Utf8ToFileName(fFiles[i], result); // use exact file name case from OS finally fSafe.ReadUnLock; end; end; function TPosixFileCaseInsensitive.Count: PtrInt; begin if self = nil then result := 0 else result := length(fFiles); end; function TPosixFileCaseInsensitive.Files: TRawUtf8DynArray; begin result := nil; if (self = nil) or (fFiles = nil) then exit; fSafe.ReadLock; try result := copy(fFiles); // make a copy for thread safety finally fSafe.ReadUnLock; end; end; {$endif OSPOSIX} { ************** Operating-System Independent Unicode Process } // freely inspired by Bero's PUCU library, released under zlib license // https://github.com/BeRo1985/pucu (C)2016-2020 Benjamin Rosseaux {$define UU_COMPRESSED} // 1KB compressed static table in the exe renders into our 20KB UU[] array :) type // 20,016 bytes for full Unicode 10.0 case folding branchless conversion {$ifdef USERECORDWITHMETHODS} TUnicodeUpperTable = record {$else} TUnicodeUpperTable = object {$endif USERECORDWITHMETHODS} public Block: array[0..37, 0..127] of integer; IndexHi: array[0..271] of byte; IndexLo: array[0..8, 0..31] of byte; // branchless Unicode 10.0 uppercase folding using our internal tables // caller should have checked that c <= UU_MAX function Ucs4Upper(c: PtrUInt): PtrUInt; {$ifdef HASINLINE} inline; {$endif} end; {$ifndef CPUX86NOTPIC} PUnicodeUpperTable = ^TUnicodeUpperTable; {$endif CPUX86NOTPIC} const UU_BLOCK_HI = 7; UU_BLOCK_LO = 127; UU_INDEX_HI = 5; UU_INDEX_LO = 31; UU_MAX = $10ffff; var {$ifdef UU_COMPRESSED} UU: TUnicodeUpperTable; {$else} UU: TUnicodeUpperTable = ( Block: ( (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 743, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, - 32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -232, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, -300), (195, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0, 0, 97, 0, 0, 0, -1, 163, 0, 0, 0, 130, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 56, 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, -1, 0, -1, 0, -1, 0, - 1, 0, -1, 0, -1, 0, -1, -79, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 10815, 10815, 0, -1, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 10783, 10780, 10782, -210, -206, 0, -205, -205, 0, -202, 0, -203, 42319, 0, 0, 0, -205, 42315, 0, -207, 0, 42280, 42308, 0, -209, -211, 42308, 10743, 42305, 0, 0, -211, 0, 10749, -213, 0, 0, -214, 0, 0, 0, 0, 0, 0, 0, 10727, 0, 0), (-218, 0, 0, -218, 0, 0, 0, 42282, -218, -69, -217, -217, -71, 0, 0, 0, 0, 0, -219, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42261, 42258, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, 130, 130, 130, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, -63, -63, 0, -62, -57, 0, 0, 0, -47, -54, -8, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -86, -80, 7, -116, 0, -96, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), (0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -15, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48), (-48, -48, -48, -48, -48, -48, -48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0), (-6254, -6253, -6244, -6242, -6242, -6243, -6236, -6181, 35266, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35332, 0, 0, 0, 3814, 0, 0), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1), (8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 86, 86, 86, 86, 100, 100, 128, 128, 112, 112, 126, 126, 0, 0), (8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -7205, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16), (0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, 0, 0, -1, 0, 0, 0, -10795, -10792, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (-7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, 0, -7264, 0, 0, 0, 0, 0, -7264, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, -1), (0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -928, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864), (-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ); IndexHi: (0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, 6, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); IndexLo: ((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14, 15, 12, 16, 17, 18, 19), (12, 12, 20, 21, 12, 12, 12, 12, 12, 22, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 23, 24, 25, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 26, 27, 28, 29, 12, 12, 12, 12, 12, 12, 30, 31, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 32, 12), (12, 12, 12, 12, 12, 12, 12, 12, 33, 34, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 35, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 37, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)); ); {$endif UU_COMPRESSED} function TUnicodeUpperTable.Ucs4Upper(c: PtrUInt): PtrUInt; var i: PtrUInt; begin // branchless conversion of 0..UU_MAX = $10ffff Unicode codepoints i := c shr UU_BLOCK_HI; result := PtrInt(c) + Block[IndexLo[IndexHi[i shr UU_INDEX_HI], i and UU_INDEX_LO], c and UU_BLOCK_LO]; end; function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char; var c: PtrUInt; S2: PUtf8Char; {$ifdef CPUX86NOTPIC} tab: TUnicodeUpperTable absolute UU; {$else} tab: PUnicodeUpperTable; {$endif CPUX86NOTPIC} begin {$ifndef CPUX86NOTPIC} tab := @UU; {$endif CPUX86NOTPIC} if S <> nil then repeat c := ord(S^); if c <= 127 then if c = 0 then break else begin inc(c, tab.Block[0, c]); // branchless a..z -> A..Z D^ := AnsiChar(c); inc(S); inc(D); continue; end else if c and $20 = 0 then begin c := (c shl 6) + byte(S[1]) - $3080; // fast process $0..$7ff inc(S, 2); end else begin S2 := S; c := UTF8_TABLE.GetHighUtf8Ucs4(S2); // handle even surrogates S := S2; if c = 0 then c := UNICODE_REPLACEMENT_CHARACTER; // =$fffd for invalid input end; if c <= UU_MAX then c := tab.Ucs4Upper(c); inc(D, Ucs4ToUtf8(c, D)); until false; D^ := #0; result := D; end; function Utf8UpperReference(S, D: PUtf8Char; SLen: PtrUInt): PUtf8Char; var c: PtrUInt; endSBy4: PUtf8Char; extra, i: PtrInt; {$ifdef CPUX86NOTPIC} tab: TUnicodeUpperTable absolute UU; utf8: TUtf8Table absolute UTF8_TABLE; {$else} tab: PUnicodeUpperTable; utf8: PUtf8Table; {$endif CPUX86NOTPIC} label by1, by4; // ugly but faster begin if (S <> nil) and (D <> nil) then begin {$ifndef CPUX86NOTPIC} tab := @UU; utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} // first handle trailing 7-bit ASCII chars, by quad inc(SLen, PtrUInt(S)); endSBy4 := PUtf8Char(SLen) - 4; if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(S) and 3 = 0) and {$endif} (S <= endSBy4) then repeat if PCardinal(S)^ and $80808080 <> 0 then goto by1; // break on first non ASCII quad by4: i := byte(S[0]); inc(i, tab.Block[0, i]); // branchless a..z -> A..Z D[0] := AnsiChar(i); i := byte(S[1]); inc(i, tab.Block[0, i]); D[1] := AnsiChar(i); i := byte(S[2]); inc(i, tab.Block[0, i]); D[2] := AnsiChar(i); i := byte(S[3]); inc(i, tab.Block[0, i]); D[3] := AnsiChar(i); inc(S, 4); inc(D, 4); until S > endSBy4; // generic loop, handling one UCS4 CodePoint per iteration if S < PUtf8Char(SLen) then repeat by1: c := byte(S^); inc(S); if c <= 127 then begin inc(c, tab.Block[0, c]); // branchless a..z -> A..Z D^ := AnsiChar(c); inc(D); if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(S) and 3 = 0) and{$endif} (S <= endSBy4) then if PCardinal(S)^ and $80808080 = 0 then goto By4 else continue else if S < PUtf8Char(SLen) then continue else break; end else begin extra := utf8.Lookup[c]; if (extra = UTF8_INVALID) or (S + extra > PUtf8Char(SLen)) then break; i := 0; repeat c := (c shl 6) + byte(S[i]); inc(i) until i = extra; inc(S, extra); with utf8.Extra[extra] do begin dec(c, offset); if c < minimum then break; // invalid input content end; if c <= UU_MAX then c := tab.Ucs4Upper(c); inc(D, Ucs4ToUtf8(c, D)); if S < PUtf8Char(SLen) then continue else break; end; until false; D^ := #0; end; result := D; end; function UpperCaseReference(const S: RawUtf8): RawUtf8; var len: integer; tmp: TSynTempBuffer; begin len := length(S); tmp.Init(len * 2); // some codepoints enhance in length tmp.Done(Utf8UpperReference(pointer(S), tmp.buf, len), result); end; function UpperCaseUcs4Reference(const S: RawUtf8): RawUcs4; var c, n: PtrUInt; U: PUtf8Char; begin if S = '' then begin result := nil; exit; end; SetLength(result, length(S) + 1); U := pointer(S); n := 0; repeat c := NextUtf8Ucs4(U); if c = 0 then break; if c <= UU_MAX then c := UU.Ucs4Upper(c); result[n] := c; inc(n); until false; if n = 0 then result := nil else begin result[n] := 0; // always end with a 0 DynArrayFakeLength(result, n); // faster than SetLength() end; end; function Utf8ICompReference(u1, u2: PUtf8Char): PtrInt; var c2: PtrInt; {$ifdef CPUX86NOTPIC} tab: TUnicodeUpperTable absolute UU; {$else} tab: PUnicodeUpperTable; {$endif CPUX86NOTPIC} label c2low; begin {$ifndef CPUX86NOTPIC} tab := @UU; {$endif CPUX86NOTPIC} if u1 <> u2 then if u1 <> nil then if u2 <> nil then repeat result := ord(u1^); c2 := ord(u2^); if result <= 127 then if result <> 0 then begin inc(u1); inc(result, tab.Block[0, result]); // branchless a..z -> A..Z if c2 <= 127 then begin c2low: if c2 = 0 then exit; // u1>u2 -> return u1^ inc(u2); inc(c2, tab.Block[0, c2]); dec(result, c2); if result <> 0 then exit; continue; end; end else begin // result=u1^=#0 -> end of u1 reached if c2 <> 0 then // end of u2 reached -> u1=u2 -> return 0 result := -1; // u1 0 then exit; until false else result := 1 // u2='' else result := -1 // u1='' else result := 0; // u1=u2 end; function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt; var c2: PtrInt; extra, i: integer; {$ifdef CPUX86NOTPIC} tab: TUnicodeUpperTable absolute UU; utf8: TUtf8Table absolute UTF8_TABLE; {$else} tab: PUnicodeUpperTable; utf8: PUtf8Table; {$endif CPUX86NOTPIC} label neg, pos; begin {$ifndef CPUX86NOTPIC} tab := @UU; utf8 := @UTF8_TABLE; {$endif CPUX86NOTPIC} if u1 <> u2 then if (u1 <> nil) and (L1 <> 0) then if (u2 <> nil) and (L2 <> 0) then repeat result := ord(u1^); c2 := ord(u2^); inc(u1); dec(L1); if result <= 127 then begin inc(result, tab.Block[0, result]); // branchless a..z -> A..Z if c2 <= 127 then begin inc(c2, tab.Block[0, c2]); dec(L2); inc(u2); dec(result, c2); if result <> 0 then // found unmatching char exit else if L1 <> 0 then if L2 <> 0 then // L1>0 and L2>0 -> next char continue else // L1>0 and L2=0 -> u1>u2 goto pos else if L2 <> 0 then // L1=0 and L2>0 -> u1 u1=u2 -> returns 0 exit; end; end else begin // fast Unicode 10.0 uppercase conversion extra := utf8.Lookup[result]; if extra = UTF8_INVALID then // invalid leading byte goto neg; dec(L1, extra); if L1 < 0 then goto neg; i := 0; repeat result := result shl 6; inc(result, ord(u1[i])); inc(i); until i = extra; inc(u1, extra); dec(result, utf8.Extra[extra].offset); if PtrUInt(result) <= UU_MAX then result := tab.Ucs4Upper(result); end; // here result=NormToUpper[u1^] inc(u2); dec(L2); if c2 <= 127 then begin inc(c2, tab.Block[0, c2]); dec(result, c2); if result <> 0 then // found unmatching codepoint exit; end else begin extra := utf8.Lookup[c2]; if extra = UTF8_INVALID then goto pos; dec(L2, extra); if L2 < 0 then goto pos; i := 0; repeat c2 := c2 shl 6; inc(c2, ord(u2[i])); inc(i); until i = extra; inc(u2, extra); dec(c2, utf8.Extra[extra].offset); if PtrUInt(c2) <= UU_MAX then c2 := tab.Ucs4Upper(c2); dec(result, c2); if result <> 0 then // found unmatching codepoint exit; end; // here we have result=0 if L1 = 0 then // test if we reached end of u1 or end of u2 if L2 = 0 then // u1=u2 exit else // u1u2 goto pos; until false else pos: // u2='' or u1>u2 result := 1 else neg: // u1='' or u1 return nil inc(c, tab.Block[0, c]); // branchless a..z -> A..Z if c <> Up[0] then continue; end else begin extra := utf8.Lookup[c]; if extra = UTF8_INVALID then exit; i := 0; repeat c := c shl 6; inc(c, ord(U[i])); inc(i); until i = extra; inc(U, extra); dec(c, utf8.Extra[extra].offset); if PtrUInt(c) <= UU_MAX then c := tab.Ucs4Upper(c); if c <> Up[0] then continue; end; // if we reached here, U^ and Up^ first UCS4 CodePoint do match u2 := U; up2 := @Up[1]; repeat if up2^ = 0 then begin result := u0; // found -> return position in U exit; end; c := byte(u2^); inc(u2); if c <= 127 then begin if c = 0 then exit; // not found -> return nil inc(c, tab.Block[0, c]); if c <> up2^ then goto nxt; inc(up2); end else begin extra := utf8.Lookup[c]; if extra = UTF8_INVALID then exit; // invalid input i := 0; repeat c := c shl 6; inc(c, ord(u2[i])); inc(i); until i = extra; inc(u2, extra); dec(c, utf8.Extra[extra].offset); if PtrUInt(c) <= UU_MAX then c := tab.Ucs4Upper(c); if c <> up2^ then goto nxt; inc(up2); end; until false; until false; end; const // reference 8-bit upper chars as in WinAnsi/CP1252 for NormToUpper/Lower[] WinAnsiToUp: array[138..255] of byte = ( 83, 139, 140, 141, 90, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 83, 155, 140, 157, 90, 89, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 65, 65, 65, 65, 65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73, 68, 78, 79, 79, 79, 79, 79, 215, 79, 85, 85, 85, 85, 89, 222, 223, 65, 65, 65, 65, 65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73, 68, 78, 79, 79, 79, 79, 79, 247, 79, 85, 85, 85, 85, 89, 222, 89); {$ifdef UU_COMPRESSED} // 1KB compressed buffer which renders into our 20,016 bytes UU[] array UU_: array[byte] of cardinal = ( $040019FD, $FF5A6024, $00855A00, $FFFFFFE0, $5A5201F0, $02E700E8, $FFE0AA5A, $E0045A4B, $5A790BFF, $045A0007, $A045A1FF, $DB1878BA, $01A82B01, $0145A000, $1DA45008, $041E5A80, $401DA450, $5A8F185A, $FFFFFED4, $590B5AC3, $0C5A84A4, $5314A453, $610008A4, $A4520F5A, $82F5A1A3, $F1EBB5AB, $5A44DDF7, $52105A84, $5A845AA4, $5A4A5AC4, $5A385AC6, $11A45217, $10ABA500, $45A00200, $4F5A4401, $0000B15A, $A05A4F04, $5A830145, $C65A0018, $5EBAA05A, $20245AC0, $85A1A452, $5BB55700, $00002A3F, $065A2A3F, $5B04A453, $1F055A40, $A11C02A1, $02A11E02, $3200012E, $45A10001, $C2000133, $3645A10C, $45A10001, $4F000135, $00550690, $000E5AA5, $A54B0CC2, $013165A1, $2845A100, $440000A5, $012FAC02, $00012D00, $F70A5144, $41000029, $000A5AA5, $2AC4A16B, $45A10D22, $2B0291FD, $85A10001, $1C00022A, $A129E700, $000226A5, $0D930008, $512A000C, $BB0D920A, $05270001, $0001B900, $0644145A, $25000107, $00280002, $120A5115, $F1F552A5, $54009C55, $A453AF5A, $5AC45A44, $0082000C, $5A208400, $FFDA00BB, $AD6EFFFF, $09DB15D5, $F045A100, $01E13201, $1201F000, $C10001C0, $45A10005, $C70001C2, $C5A10001, $CA0001D1, $01F80001, $A045A100, $01AA33BA, $0001B000, $D0000007, $001EFBFB, $A2FFFF8C, $0002A0AB, $85A15A83, $E0D0BAA2, $01F00BFF, $2E01F012, $04F004F2, $3645A02A, $240D45A0, $5A44A459, $847E5A40, $115A405A, $400002F1, $45A0115A, $CC5A400D, $1FD000C4, $01255507, $8202F000, $55F1F555, $00C955F4, $700001F8, $85A10200, $FFFFE792, $9C018193, $859E0181, $01819D01, $DB0181A4, $89C20181, $00C5F558, $3C90F1E4, $E5A18A04, $E5A10EE6, $5A40BAA2, $CC5A40CC, $01C50014, $A045B100, $45AAFFBA, $00000008, $5A070080, $04800023, $80002B5A, $80000604, $00000635, $BC2F5A0E, $00F75B6D, $D875A108, $050A30A7, $014A0009, $5604A200, $056A0001, $42000164, $00018006, $01700802, $7E070200, $A17E0001, $070080B5, $80080001, $00022635, $35860002, $C4304825, $810975A1, $01E3DBB5, $090010A5, $8004335A, $80053B5A, $5A07000F, $F1CA0137, $E4006C55, $84A501FF, $0001F000, $8E2A00F0, $5AEDC05B, $55F15B04, $002F55F4, $900001E6, $F5525201, $87FFD019, $5A1202F0, $000C5A84, $FFFFD5D5, $AA02A1D8, $1845A545, $5A84A453, $40A45328, $5A40CC5A, $FB5FC736, $445804BF, $305B045A, $01C1A000, $A182C5E0, $B1C5E245, $F4C5E345, $A4504E55, $A4504C77, $1E55F441, $C417A450, $405A445A, $588A9C5A, $5A405A84, $045B0406, $C05A445B, $592C2A5A, $C6F021A4, $6E55F49B, $01FC6000, $300070A5, $60FFFF68, $0970FF7C, $F1F55219, $FFE00655, $35F55257, $0001D800, $528A0270, $2255F1F5, $527F7FD0, $F20011F5, $00063B03, $B603F000, $E035F552, $01F057FF, $09F55206, $0001DE00, $5A720210, $020100F1, $0403075A, $0503045A, $0C5A0706, $F15A0803, $00000012, $03120103, $08675104, $5A0B0A09, $5A0D0C1B, $0F0E0C11, $1211100C, $140C0C13, $0C055A15, $00005A16, $0C0E0000, $5A191817, $1B1A0C31, $065A1D1C, $5A1F1E0C, $5A200C26, $22210C09, $230C0F5A, $0000175A, $240C0000, $250C205A, $000C0D5A, $00000000); procedure InitializeUU; var tmp: array[0..7000] of byte; // need only 6653 bytes begin // Uppercase Unicode table RLE + SynLZ decompression from 1KB to 20KB :) if (RleUnCompress(@tmp, @UU, SynLZdecompress1(@UU_, 1019, @tmp)) <> SizeOf(UU)) or (crc32c(0, @UU, SizeOf(UU)) <> $7343D053) then raise ESynUnicode.Create('UU Table Decompression Failed'); // paranoid end; {$endif UU_COMPRESSED} (* procedure doUU; var tmp1, tmp2: array[0..5500] of cardinal; rle, lz, i: PtrInt; l: RawUtf8; begin rle := RleCompress(@UU, @tmp1, SizeOf(UU), SizeOf(tmp1)); lz := SynLZCompress1(@tmp1, rle, @tmp2); writeln(SizeOf(UU)); writeln(rle); writeln(lz); writeln('UU_ = array[byte] of cardinal = ('); l := ' '; for i := 0 to 255 do begin l := l + '$' + HexStr(tmp2[i], 8) + ','; if length(l) > 70 then begin writeln(l); l := ' '; end; end; writeln(l, ');'); end; *) procedure InitializeUnit; var i: PtrInt; c: AnsiChar; begin // decompress 1KB static in the exe into 20KB UU[] array for Unicode Uppercase {$ifdef UU_COMPRESSED} InitializeUU; {$endif UU_COMPRESSED} // initialize internal lookup tables for various text conversions for i := 0 to 255 do NormToNormByte[i] := i; NormToUpperAnsi7Byte := NormToNormByte; for i := ord('a') to ord('z') do dec(NormToUpperAnsi7Byte[i], 32); NormToLowerAnsi7Byte := NormToNormByte; for i := ord('A') to ord('Z') do inc(NormToLowerAnsi7Byte[i], 32); MoveFast(NormToUpperAnsi7, NormToUpper, 138); MoveFast(WinAnsiToUp, NormToUpperByte[138], SizeOf(WinAnsiToUp)); for i := 0 to 255 do begin c := NormToUpper[AnsiChar(i)]; if c in ['A'..'Z'] then inc(c, 32); NormToLower[AnsiChar(i)] := c; end; for c := low(c) to high(c) do begin if not (c in [#0, #10, #13]) then include(TEXT_CHARS[c], tcNot01013); if c in [#10, #13] then include(TEXT_CHARS[c], tc1013); if c in ['0'..'9', 'a'..'z', 'A'..'Z'] then include(TEXT_CHARS[c], tcWord); if c in ['_', 'a'..'z', 'A'..'Z'] then include(TEXT_CHARS[c], tcIdentifierFirstChar); if c in ['_', '0'..'9', 'a'..'z', 'A'..'Z'] then include(TEXT_CHARS[c], tcIdentifier); if c in ['_', '-', '.', '0'..'9', 'a'..'z', 'A'..'Z'] then // '~' is part of the RFC 3986 but should be escaped in practice // see https://blog.synopse.info/?post/2020/08/11/The-RFC%2C-The-URI%2C-and-The-Tilde include(TEXT_CHARS[c], tcUriUnreserved); if c in [#1..#9, #11, #12, #14..' '] then include(TEXT_CHARS[c], tcCtrlNotLF); if c in [#1..' ', ';'] then include(TEXT_CHARS[c], tcCtrlNot0Comma); end; // setup sorting functions redirection StrCompByCase[false] := @StrComp; StrCompByCase[true] := @StrIComp; {$ifdef CPUINTEL} SortDynArrayAnsiStringByCase[false] := @SortDynArrayAnsiString; {$else} SortDynArrayAnsiStringByCase[false] := @SortDynArrayRawByteString; {$endif CPUINTEL} SortDynArrayAnsiStringByCase[true] := @SortDynArrayAnsiStringI; IdemPropNameUSameLen[false] := @IdemPropNameUSameLenNotNull; IdemPropNameUSameLen[true] := @mormot.core.base.CompareMem; // setup basic Unicode conversion engines SetLength(SynAnsiConvertListCodePage, 16); // no resize -> more thread safe CurrentAnsiConvert := NewEngine(Unicode_CodePage); WinAnsiConvert := NewEngine(CP_WINANSI) as TSynAnsiFixedWidth; Utf8AnsiConvert := NewEngine(CP_UTF8) as TSynAnsiUtf8; RawByteStringConvert := NewEngine(CP_RAWBYTESTRING) as TSynAnsiFixedWidth; // setup optimized ASM functions IsValidUtf8Buffer := @IsValidUtf8Pas; {$ifdef ASMX64AVXNOCONST} if cpuHaswell in X64CpuFeatures then // Haswell CPUs can use simdjson AVX2 asm for IsValidUtf8() IsValidUtf8Buffer := @IsValidUtf8Avx2; {$endif ASMX64AVXNOCONST} end; initialization InitializeUnit; end.