mirror of
https://github.com/danieleteti/delphimvcframework.git
synced 2024-11-15 15:55:54 +01:00
10432 lines
338 KiB
ObjectPascal
10432 lines
338 KiB
ObjectPascal
/// Framework Core Low-Level Unicode UTF-8 UTF-16 Ansi Conversion
|
|
// - this unit is a part of the Open Source Synopse mORMot framework 2,
|
|
// licensed under a MPL/GPL/LGPL three license - see LICENSE.md
|
|
unit mormot.core.unicode;
|
|
|
|
{
|
|
*****************************************************************************
|
|
|
|
Efficient Unicode Conversion Classes shared by all framework units
|
|
- UTF-8 Efficient Encoding / Decoding
|
|
- UTF-8 / UTF-16 / Ansi Conversion Classes
|
|
- Text File Loading with BOM/Unicode Support
|
|
- Low-Level String Conversion Functions
|
|
- Text Case-(in)sensitive Conversion and Comparison
|
|
- UTF-8 String Manipulation Functions
|
|
- TRawUtf8DynArray Processing Functions
|
|
- Operating-System Independent Unicode Process
|
|
|
|
*****************************************************************************
|
|
}
|
|
|
|
interface
|
|
|
|
{$I mormot.defines.inc}
|
|
|
|
uses
|
|
classes,
|
|
sysutils,
|
|
mormot.core.base,
|
|
mormot.core.os;
|
|
|
|
|
|
{ *************** UTF-8 Efficient Encoding / Decoding }
|
|
|
|
// some constants used for UTF-8 conversion, including surrogates
|
|
type
|
|
// see http://floodyberry.wordpress.com/2007/04/14/utf-8-conversion-tricks
|
|
/// lookup tables and high-level decoder used for UTF-8 conversion
// - declared as a record with methods when USERECORDWITHMETHODS is defined,
// or as an old-style object otherwise: both variants share the same fields
{$ifdef USERECORDWITHMETHODS}
|
|
TUtf8Table = record
|
|
{$else}
|
|
TUtf8Table = object
|
|
{$endif USERECORDWITHMETHODS}
|
|
public
|
|
/// one classification entry per possible byte value
// - NOTE(review): presumably indexed by the first byte of a UTF-8 sequence,
// returning values like UTF8_ASCII / UTF8_INVALID / UTF8_ZERO or the number
// of extra bytes to read - confirm against the implementation section
Lookup: array[byte] of byte;
|
|
/// seven offset/minimum pairs
// - NOTE(review): presumably indexed by the Lookup[] value, offset being
// subtracted from the accumulated bytes and minimum being the smallest
// acceptable decoded code point - confirm against the implementation section
Extra: array[0..6] of record
|
|
offset, minimum: cardinal;
|
|
end;
|
|
/// five entries, indexed from 2 to 6
// - NOTE(review): presumably the leading byte marker to emit when encoding
// a UTF-8 sequence of 2..6 bytes - confirm against the implementation section
FirstByte: array[2..6] of byte;
|
|
/// retrieve a >127 UCS4 CodePoint from UTF-8
// - U is a var parameter, so the caller's pointer may be modified
function GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint;
|
|
end;
|
|
PUtf8Table = ^TUtf8Table;
|
|
|
|
const
|
|
/// value stored in UTF8_TABLE.Lookup[] for bytes $01..$7f
UTF8_ASCII = 0;
|
|
/// value stored in UTF8_TABLE.Lookup[] for bytes $80..$bf and $fe..$ff
UTF8_INVALID = 6;
|
|
/// value stored in UTF8_TABLE.Lookup[] for byte $00
UTF8_ZERO = 7;
|
|
/// constant tables used for UTF-8 encoding / decoding
UTF8_TABLE: TUtf8Table = (
|
|
// 256 entries, written as 16 rows of 16 values
Lookup: (
|
|
// $00 = UTF8_ZERO, then $01..$7f = UTF8_ASCII (8 rows)
7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
// $80..$bf = UTF8_INVALID (4 rows)
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
|
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
|
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
|
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
|
|
// $c0..$df = 1 (2 rows)
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
// $e0..$ef = 2
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
|
// $f0..$f7 = 3, $f8..$fb = 4, $fc..$fd = 5, $fe..$ff = UTF8_INVALID
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6, 6);
|
|
// 7 entries, matching the array[0..6] declaration
// - offsets of entries 1..3 equal the summed marker bits of the leading and
// continuation bytes, e.g. $3080 = ($c0 shl 6) + $80 and
// $e2080 = ($e0 shl 12) + ($80 shl 6) + $80; entries 4 and 5 follow the same
// pattern, truncated to 32-bit
// - NOTE(review): minimum presumably is the smallest code point allowed for
// each length, to reject overlong encodings - confirm against the decoder
Extra: (
|
|
(offset: $00000000; minimum: $00010000),
|
|
(offset: $00003080; minimum: $00000080),
|
|
(offset: $000e2080; minimum: $00000800),
|
|
(offset: $03c82080; minimum: $00010000),
|
|
(offset: $fa082080; minimum: $00200000),
|
|
(offset: $82082080; minimum: $04000000),
|
|
(offset: $00000000; minimum: $04000000));
|
|
// 5 entries, matching the array[2..6] declaration: those values are the
// UTF-8 leading byte markers of 2, 3, 4, 5 and 6 bytes sequences
FirstByte: (
|
|
$c0, $e0, $f0, $f8, $fc));
|
|
|
|
// NOTE(review): presumably the UTF8_TABLE.Extra[] index (minimum = $10000)
// from which a decoded code point needs a UTF-16 surrogate pair - confirm
UTF8_EXTRA_SURROGATE = 3;
|
|
// first value of the UTF-16 high (leading) surrogates range
UTF16_HISURROGATE_MIN = $d800;
|
|
// last value of the UTF-16 high (leading) surrogates range
UTF16_HISURROGATE_MAX = $dbff;
|
|
// first value of the UTF-16 low (trailing) surrogates range
UTF16_LOSURROGATE_MIN = $dc00;
|
|
// last value of the UTF-16 low (trailing) surrogates range
UTF16_LOSURROGATE_MAX = $dfff;
|
|
// equals UTF16_HISURROGATE_MIN - ($10000 shr 10)
// - NOTE(review): presumably added to (ucs4 shr 10) to compute the high
// surrogate of a code point above $ffff - confirm against the encoder
UTF16_SURROGATE_OFFSET = $d7c0;
|
|
|
|
/// replace any incoming character whose value is unrepresentable in Unicode
|
|
// - set e.g. by GetUtf8WideChar(), Utf8UpperReference() or
|
|
// RawUnicodeToUtf8() when ccfReplacementCharacterForUnmatchedSurrogate is set
|
|
// - encoded as $ef $bf $bd bytes in UTF-8
|
|
UNICODE_REPLACEMENT_CHARACTER = $fffd;
|
|
|
|
|
|
/// internal function, used to retrieve a >127 UCS4 CodePoint from UTF-8
|
|
// - not to be called directly, but from inlined higher-level functions
|
|
// - here U^ shall be always >= #80
|
|
// - typical use is as such:
|
|
// ! ch := ord(P^);
|
|
// ! if ch and $80=0 then
|
|
// ! inc(P) else
|
|
// ! ch := GetHighUtf8Ucs4(P);
|
|
function GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// decode UTF-16 WideChar from UTF-8 input buffer
|
|
// - any surrogate (Ucs4>$ffff) is returned as UNICODE_REPLACEMENT_CHARACTER=$fffd
|
|
function GetUtf8WideChar(P: PUtf8Char): cardinal;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// get the UCS4 CodePoint stored in P^ (decode UTF-8 if necessary)
|
|
function NextUtf8Ucs4(var P: PUtf8Char): Ucs4CodePoint;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// UTF-8 encode one UTF-16 encoded UCS4 CodePoint into Dest
|
|
// - return the number of bytes written into Dest (i.e. from 1 up to 6)
|
|
// - Source will contain the next UTF-16 character
|
|
// - this method DOES properly handle UTF-16 surrogate pairs
|
|
function Utf16CharToUtf8(Dest: PUtf8Char; var Source: PWord): integer;
|
|
|
|
/// UTF-8 encode one UCS4 CodePoint into Dest
|
|
// - return the number of bytes written into Dest (i.e. from 1 up to 6)
|
|
// - this method DOES properly handle UTF-16 surrogate pairs
|
|
function Ucs4ToUtf8(ucs4: Ucs4CodePoint; Dest: PUtf8Char): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
type
|
|
/// option set for RawUnicodeToUtf8() conversion
// - ccfNoTrailingZero: do not append a trailing #0 to the output, as stated
// by the RawUnicodeToUtf8() buffer overload documentation
// - ccfReplacementCharacterForUnmatchedSurrogate: identify unmatched
// UTF-16 surrogates and replace them with UNICODE_REPLACEMENT_CHARACTER
|
|
TCharConversionFlags = set of (
|
|
ccfNoTrailingZero,
|
|
ccfReplacementCharacterForUnmatchedSurrogate);
|
|
|
|
/// convert a UTF-16 PWideChar buffer into a UTF-8 string
|
|
procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
|
|
var result: RawUtf8; Flags: TCharConversionFlags = [ccfNoTrailingZero]); overload;
|
|
|
|
/// convert a UTF-16 PWideChar buffer into a UTF-8 temporary buffer
|
|
procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
|
|
var result: TSynTempBuffer; Flags: TCharConversionFlags); overload;
|
|
|
|
/// convert a UTF-16 PWideChar buffer into a UTF-8 string
|
|
function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
|
|
Flags: TCharConversionFlags = [ccfNoTrailingZero]): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert a UTF-16 PWideChar buffer into a UTF-8 buffer
|
|
// - replace system.UnicodeToUtf8 implementation, which is rather slow
|
|
// since Delphi 2009+
|
|
// - append a trailing #0 to the ending PUtf8Char, unless ccfNoTrailingZero is set
|
|
// - if ccfReplacementCharacterForUnmatchedSurrogate is set, this function will identify
|
|
// unmatched surrogate pairs and replace them with UNICODE_REPLACEMENT_CHARACTER -
|
|
// see https://en.wikipedia.org/wiki/Specials_(Unicode_block)
|
|
function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt;
|
|
Source: PWideChar; SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt; overload;
|
|
|
|
/// convert a UTF-16 PWideChar buffer into a UTF-8 string
|
|
// - this version doesn't resize the resulting RawUtf8 string, but return
|
|
// the new resulting RawUtf8 byte count into Utf8Length
|
|
function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
|
|
out Utf8Length: integer): RawUtf8; overload;
|
|
|
|
|
|
/// convert an UTF-8 encoded text into a WideChar (UTF-16) buffer
|
|
// - faster than System.Utf8ToUnicode
|
|
// - sourceBytes can be 0: the length is then computed from the zero-terminated source
|
|
// - enough place must be available in dest buffer (guess is sourceBytes*3+2)
|
|
// - a WideChar(#0) is added at the end (if something is written) unless
|
|
// NoTrailingZero is TRUE
|
|
// - returns the BYTE count written in dest, excluding the ending WideChar(#0)
|
|
function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; sourceBytes: PtrInt = 0;
|
|
NoTrailingZero: boolean = false): PtrInt; overload;
|
|
|
|
/// convert an UTF-8 encoded text into a WideChar (UTF-16) buffer
|
|
// - faster than System.Utf8ToUnicode
|
|
// - this overloaded function expect a MaxDestChars parameter
|
|
// - sourceBytes can not be 0 for this function
|
|
// - enough place must be available in dest buffer (guess is sourceBytes*3+2)
|
|
// - a WideChar(#0) is added at the end (if something is written) unless
|
|
// NoTrailingZero is TRUE
|
|
// - returns the BYTE COUNT (not WideChar count) written in dest, excluding the
|
|
// ending WideChar(#0)
|
|
function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char;
|
|
MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean = false): PtrInt; overload;
|
|
|
|
/// direct conversion of a UTF-8 encoded buffer into a WinAnsi ShortString buffer
|
|
// - non WinAnsi chars are replaced by '?' placeholders
|
|
procedure Utf8ToShortString(var dest: ShortString; source: PUtf8Char);
|
|
|
|
/// calculate the UTF-16 Unicode characters count, UTF-8 encoded in source^
|
|
// - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates
|
|
// - faster than System.Utf8ToUnicode with dest=nil
|
|
function Utf8ToUnicodeLength(source: PUtf8Char): PtrUInt;
|
|
|
|
/// returns TRUE if the supplied buffer has valid UTF-8 encoding
|
|
// - will also refuse #0 characters within the buffer
|
|
// - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM
|
|
var
|
|
IsValidUtf8Buffer: function(source: PUtf8Char; sourcelen: PtrInt): boolean;
|
|
|
|
/// returns TRUE if the supplied buffer has valid UTF-8 encoding
|
|
// - will also refuse #0 characters within the buffer
|
|
// - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM, reaching e.g.
|
|
// 21 GB/s parsing speed on a Core i5-13500
|
|
function IsValidUtf8(const source: RawUtf8): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns TRUE if the supplied buffer has valid UTF-8 encoding
|
|
// - will stop when the buffer contains #0
|
|
// - just a wrapper around IsValidUtf8Buffer(source, StrLen(source)) so if you
|
|
// know the source length, you would better call IsValidUtf8Buffer() directly
|
|
// - on Haswell AVX2 Intel/AMD CPUs, will use very efficient ASM, reaching e.g.
|
|
// 15 GB/s parsing speed on a Core i5-13500 - StrLen() itself runs at 37 GB/s
|
|
function IsValidUtf8(source: PUtf8Char): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// detect UTF-8 content and mark the variable with the CP_UTF8 codepage
|
|
// - to circumvent FPC concatenation bug with CP_UTF8 and CP_RAWBYTESTRING
|
|
procedure DetectRawUtf8(var source: RawByteString);
|
|
{$ifndef HASCODEPAGE}{$ifdef HASINLINE}inline;{$endif}{$endif}
|
|
|
|
/// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #1..#31
|
|
// control characters
|
|
// - supplied input is a pointer to a #0 ended text buffer
|
|
function IsValidUtf8WithoutControlChars(source: PUtf8Char): boolean; overload;
|
|
|
|
/// returns TRUE if the supplied buffer has valid UTF-8 encoding with no #0..#31
|
|
// control characters
|
|
// - supplied input is a RawUtf8 variable
|
|
function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean; overload;
|
|
|
|
/// will truncate the supplied UTF-8 value if its length exceeds the specified
|
|
// UTF-16 Unicode characters count
|
|
// - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates
|
|
// - returns FALSE if text was not truncated, TRUE otherwise
|
|
function Utf8TruncateToUnicodeLength(var text: RawUtf8; maxUtf16: integer): boolean;
|
|
|
|
/// will truncate the supplied UTF-8 value if its length exceeds the specified
|
|
// bytes count
|
|
// - this function will ensure that the returned content will contain only valid
|
|
// UTF-8 sequence, i.e. will trim the whole trailing UTF-8 sequence
|
|
// - returns FALSE if text was not truncated, TRUE otherwise
|
|
function Utf8TruncateToLength(var text: RawUtf8; maxBytes: PtrUInt): boolean;
|
|
|
|
/// compute the truncated length of the supplied UTF-8 value if it exceeds the
|
|
// specified bytes count
|
|
// - this function will ensure that the returned content will contain only valid
|
|
// UTF-8 sequence, i.e. will trim the whole trailing UTF-8 sequence
|
|
// - returns maxBytes if text was not truncated, or the number of fitting bytes
|
|
function Utf8TruncatedLength(const text: RawUtf8; maxBytes: PtrUInt): PtrInt; overload;
|
|
|
|
/// compute the truncated length of the supplied UTF-8 value if it exceeds the
|
|
// specified bytes count
|
|
// - this function will ensure that the returned content will contain only valid
|
|
// UTF-8 sequence, i.e. will trim the whole trailing UTF-8 sequence
|
|
// - returns maxBytes if text was not truncated, or the number of fitting bytes
|
|
function Utf8TruncatedLength(text: PAnsiChar;
|
|
textlen, maxBytes: PtrUInt): PtrInt; overload;
|
|
|
|
/// calculate the UTF-16 Unicode characters count of the UTF-8 encoded first line
|
|
// - count may not match the UCS4 CodePoint, in case of UTF-16 surrogates
|
|
// - end the parsing at first #13 or #10 character
|
|
function Utf8FirstLineToUtf16Length(source: PUtf8Char): PtrInt;
|
|
|
|
|
|
|
|
{ **************** UTF-8 / UTF-16 / Ansi Conversion Classes }
|
|
|
|
type
|
|
/// Exception raised by this unit in case of fatal conversion issue
|
|
ESynUnicode = class(ExceptionWithProps);
|
|
|
|
/// an abstract class to handle Ansi to/from Unicode translation
|
|
// - implementations of this class will handle efficiently all Code Pages
|
|
// - this default implementation will use the Operating System APIs
|
|
// - you should not create your own class instance by yourself, but should
|
|
// better retrieve an instance using TSynAnsiConvert.Engine(), which will
|
|
// initialize either a TSynAnsiFixedWidth or a TSynAnsiConvert instance on need
|
|
TSynAnsiConvert = class
|
|
protected
|
|
// code page of this engine, published by the CodePage property
fCodePage: cardinal;
|
|
// worst-case length binary shift, published by the AnsiCharShift property
fAnsiCharShift: byte;
|
|
public
|
|
/// initialize the internal conversion engine
// - declared as virtual, so that inherited classes could override it
constructor Create(aCodePage: cardinal); reintroduce; virtual;
|
|
/// returns the engine corresponding to a given code page
|
|
// - a global list of TSynAnsiConvert instances is handled by the unit -
|
|
// therefore, caller should not release the returned instance
|
|
// - will return nil in case of unhandled code page
|
|
// - if aCodePage is 0, will return CurrentAnsiConvert value
|
|
class function Engine(aCodePage: cardinal): TSynAnsiConvert;
|
|
/// direct conversion of a PAnsiChar buffer into a Unicode buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*2 bytes
|
|
// - this default implementation will use the Operating System APIs
|
|
// - will append a trailing #0 to the returned PWideChar, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; overload; virtual;
|
|
/// direct conversion of a PAnsiChar buffer into a UTF-8 encoded buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*3 bytes
|
|
// - will append a trailing #0 to the returned PUtf8Char, unless
|
|
// NoTrailingZero is set
|
|
// - this default implementation will use the Operating System APIs
|
|
function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; overload; virtual;
|
|
{$ifndef PUREMORMOT2}
|
|
/// convert any Ansi Text into a UTF-16 Unicode String
|
|
// - returns a value using our RawUnicode kind of string
|
|
function AnsiToRawUnicode(const AnsiText: RawByteString): RawUnicode; overload;
|
|
/// convert any Ansi buffer into a Unicode String
|
|
// - returns a value using our RawUnicode kind of string
|
|
function AnsiToRawUnicode(
|
|
Source: PAnsiChar; SourceChars: cardinal): RawUnicode; overload; virtual;
|
|
{$endif PUREMORMOT2}
|
|
/// convert any Ansi buffer into a Unicode String
|
|
// - returns a SynUnicode, i.e. Delphi 2009+ UnicodeString or a WideString
|
|
function AnsiToUnicodeString(
|
|
Source: PAnsiChar; SourceChars: cardinal): SynUnicode; overload;
|
|
/// convert any Ansi Text into a Unicode String
|
|
// - returns a SynUnicode, i.e. Delphi 2009+ UnicodeString or a WideString
|
|
function AnsiToUnicodeString(const Source: RawByteString): SynUnicode; overload;
|
|
/// convert any Ansi Text into a UTF-8 encoded String
|
|
// - internally calls AnsiBufferToUtf8 virtual method
|
|
function AnsiToUtf8(const AnsiText: RawByteString): RawUtf8; virtual;
|
|
/// direct conversion of a PAnsiChar buffer into a UTF-8 encoded string
|
|
// - will call AnsiBufferToUnicode() overloaded virtual method
// - NOTE(review): AnsiBufferToUtf8() would be the expected callee for a
// UTF-8 result - confirm against the implementation section
|
|
procedure AnsiBufferToRawUtf8(Source: PAnsiChar;
|
|
SourceChars: cardinal; out Value: RawUtf8); overload; virtual;
|
|
/// direct conversion of a Unicode buffer into a PAnsiChar buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars * 3 bytes
|
|
// - will detect and ignore any trailing UTF-16LE BOM marker
|
|
// - this default implementation will rely on the Operating System for
|
|
// all non ASCII-7 chars
|
|
function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar;
|
|
SourceChars: cardinal): PAnsiChar; overload; virtual;
|
|
/// direct conversion of a Unicode buffer into an Ansi Text
|
|
function UnicodeBufferToAnsi(Source: PWideChar;
|
|
SourceChars: cardinal): RawByteString; overload; virtual;
|
|
/// convert any Unicode-encoded String into Ansi Text
|
|
// - internally calls UnicodeBufferToAnsi virtual method
|
|
function UnicodeStringToAnsi(const Source: SynUnicode): RawByteString;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
{$ifndef PUREMORMOT2}
|
|
/// convert any Unicode-encoded String into Ansi Text
|
|
// - internally calls UnicodeBufferToAnsi virtual method
|
|
function RawUnicodeToAnsi(const Source: RawUnicode): RawByteString;
|
|
{$endif PUREMORMOT2}
|
|
/// direct conversion of a UTF-8 encoded buffer into a PAnsiChar buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars bytes
|
|
// - no trailing #0 is appended to the buffer
|
|
function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char;
|
|
SourceChars: cardinal): PAnsiChar; overload; virtual;
|
|
/// convert any UTF-8 encoded buffer into Ansi Text
|
|
// - internally calls Utf8BufferToAnsi virtual method
|
|
function Utf8BufferToAnsi(Source: PUtf8Char;
|
|
SourceChars: cardinal): RawByteString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
/// convert any UTF-8 encoded buffer into Ansi Text
|
|
// - internally calls Utf8BufferToAnsi virtual method
|
|
procedure Utf8BufferToAnsi(Source: PUtf8Char;
|
|
SourceChars: cardinal; var result: RawByteString); overload; virtual;
|
|
/// convert any UTF-8 encoded String into Ansi Text
|
|
// - internally calls Utf8BufferToAnsi virtual method
|
|
function Utf8ToAnsi(const u: RawUtf8): RawByteString; virtual;
|
|
/// direct conversion of a UTF-8 encoded string into a WinAnsi <2KB buffer
// - NOTE(review): "WinAnsi" presumably means the code page of this
// instance, and not always code page 1252 - confirm
|
|
// - will truncate the destination string to DestSize bytes (including the
|
|
// trailing #0), with a maximum handled size of 2048 bytes
|
|
// - returns the number of bytes stored in Dest^ (i.e. the position of #0)
|
|
function Utf8ToAnsiBuffer2K(const S: RawUtf8;
|
|
Dest: PAnsiChar; DestSize: integer): integer;
|
|
/// convert any Ansi Text (providing a From converter) into Ansi Text
|
|
function AnsiToAnsi(From: TSynAnsiConvert;
|
|
const Source: RawByteString): RawByteString; overload;
|
|
/// convert any Ansi buffer (providing a From converter) into Ansi Text
|
|
function AnsiToAnsi(From: TSynAnsiConvert; Source: PAnsiChar;
|
|
SourceChars: cardinal): RawByteString; overload;
|
|
/// corresponding code page
|
|
property CodePage: cardinal
|
|
read fCodePage;
|
|
/// corresponding length binary shift used for worst conversion case
|
|
property AnsiCharShift: byte
|
|
read fAnsiCharShift;
|
|
end;
|
|
|
|
/// a class to handle Ansi to/from Unicode translation of fixed width encoding
|
|
// (i.e. non MBCS)
|
|
// - this class will efficiently handle all available Code Pages without MBCS
|
|
// encoding - like WinAnsi (1252) or Russian (1251)
|
|
// - it will use internal fast look-up tables for such encodings
|
|
// - this class could take some time to generate, and will consume more than
|
|
// 64 KB of memory: you should not create your own class instance by yourself,
|
|
// but should better retrieve an instance using TSynAnsiConvert.Engine(), which
|
|
// will initialize either a TSynAnsiFixedWidth or a TSynAnsiConvert instance
|
|
// on need
|
|
// - this class has some additional methods (e.g. IsValid*) which take
|
|
// advantage of the internal lookup tables to provide some fast process
|
|
TSynAnsiFixedWidth = class(TSynAnsiConvert)
|
|
protected
|
|
// Ansi-To-Unicode lookup table, published by the AnsiToWide property
fAnsiToWide: TWordDynArray;
|
|
// Unicode-To-Ansi lookup table, published by the WideToAnsi property
fWideToAnsi: TByteDynArray;
|
|
public
|
|
/// initialize the internal conversion engine
|
|
constructor Create(aCodePage: cardinal); override;
|
|
/// direct conversion of a PAnsiChar buffer into a Unicode buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*2 bytes
|
|
// - will append a trailing #0 to the returned PWideChar, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; override;
|
|
/// direct conversion of a PAnsiChar buffer into a UTF-8 encoded buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*3 bytes
|
|
// - will append a trailing #0 to the returned PUtf8Char, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; override;
|
|
{$ifndef PUREMORMOT2}
|
|
/// convert any Ansi buffer into a Unicode String
|
|
// - returns a value using our RawUnicode kind of string
|
|
function AnsiToRawUnicode(Source: PAnsiChar;
|
|
SourceChars: cardinal): RawUnicode; override;
|
|
{$endif PUREMORMOT2}
|
|
/// direct conversion of a Unicode buffer into a PAnsiChar buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars * 3 bytes
|
|
// - will detect and ignore any trailing UTF-16LE BOM marker
|
|
// - this overridden version will use internal lookup tables for fast process
|
|
function UnicodeBufferToAnsi(Dest: PAnsiChar;
|
|
Source: PWideChar; SourceChars: cardinal): PAnsiChar; override;
|
|
/// direct conversion of a UTF-8 encoded buffer into a PAnsiChar buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars bytes
|
|
// - no trailing #0 is appended to the buffer
|
|
// - non Ansi compatible characters are replaced by '?'
|
|
function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char;
|
|
SourceChars: cardinal): PAnsiChar; override;
|
|
/// conversion of a wide char into the corresponding Ansi character
|
|
// - return -1 for an unknown WideChar in the current code page
|
|
function WideCharToAnsiChar(wc: cardinal): integer;
|
|
/// return TRUE if the supplied unicode buffer only contains characters of
|
|
// the corresponding Ansi code page
|
|
// - i.e. if the text can be displayed using this code page
// - this overload expects an explicit Length of the WideText buffer
|
|
function IsValidAnsi(WideText: PWideChar; Length: PtrInt): boolean; overload;
|
|
/// return TRUE if the supplied unicode buffer only contains characters of
|
|
// the corresponding Ansi code page
|
|
// - i.e. if the text can be displayed using this code page
// - NOTE(review): with no Length parameter, WideText is presumably
// expected to be #0 terminated - confirm against the implementation
|
|
function IsValidAnsi(WideText: PWideChar): boolean; overload;
|
|
/// return TRUE if the supplied UTF-8 buffer only contains characters of
|
|
// the corresponding Ansi code page
|
|
// - i.e. if the text can be displayed using this code page
|
|
function IsValidAnsiU(Utf8Text: PUtf8Char): boolean;
|
|
/// return TRUE if the supplied UTF-8 buffer only contains 8-bit characters
|
|
// of the corresponding Ansi code page
|
|
// - i.e. if the text can be displayed with only 8-bit unicode characters
|
|
// (e.g. no "tm" or such) within this code page
|
|
function IsValidAnsiU8Bit(Utf8Text: PUtf8Char): boolean;
|
|
/// direct access to the Ansi-To-Unicode lookup table
|
|
// - use this array like AnsiToWide: array[byte] of word
|
|
property AnsiToWide: TWordDynArray
|
|
read fAnsiToWide;
|
|
/// direct access to the Unicode-To-Ansi lookup table
|
|
// - use this array like WideToAnsi: array[word] of byte
|
|
// - any unhandled WideChar will return ord('?')
|
|
property WideToAnsi: TByteDynArray
|
|
read fWideToAnsi;
|
|
end;
|
|
|
|
/// a class to handle UTF-8 to/from Unicode translation
|
|
// - match the TSynAnsiConvert signature, for code page CP_UTF8
|
|
// - this class is mostly a non-operation for conversion to/from UTF-8
|
|
TSynAnsiUtf8 = class(TSynAnsiConvert)
|
|
public
|
|
/// initialize the internal conversion engine
|
|
constructor Create(aCodePage: cardinal); override;
|
|
/// direct conversion of a PAnsiChar UTF-8 buffer into a Unicode buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*2 bytes
|
|
// - will append a trailing #0 to the returned PWideChar, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; override;
|
|
/// direct conversion of a PAnsiChar UTF-8 buffer into a UTF-8 encoded buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*3 bytes
// - NOTE(review): SourceChars*3 matches the inherited contract, but a
// UTF-8 to UTF-8 copy presumably needs no more than SourceChars bytes
// (plus the trailing #0) - confirm against the implementation
|
|
// - will append a trailing #0 to the returned PUtf8Char, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; override;
|
|
{$ifndef PUREMORMOT2}
|
|
/// convert any UTF-8 Ansi buffer into a Unicode String
|
|
// - returns a value using our RawUnicode kind of string
|
|
function AnsiToRawUnicode(Source: PAnsiChar;
|
|
SourceChars: cardinal): RawUnicode; override;
|
|
{$endif PUREMORMOT2}
|
|
/// direct conversion of a Unicode buffer into a PAnsiChar UTF-8 buffer
|
|
// - will detect and ignore any trailing UTF-16LE BOM marker
|
|
// - Dest^ buffer must be reserved with at least SourceChars * 3 bytes
|
|
function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar;
|
|
SourceChars: cardinal): PAnsiChar; override;
|
|
/// direct conversion of a Unicode buffer into an Ansi Text
// - NOTE(review): for this class, the returned Ansi Text is presumably
// UTF-8 encoded - confirm against the implementation
|
|
function UnicodeBufferToAnsi(Source: PWideChar;
|
|
SourceChars: cardinal): RawByteString; override;
|
|
/// direct conversion of a UTF-8 encoded buffer into a PAnsiChar UTF-8 buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars bytes
|
|
// - no trailing #0 is appended to the buffer
|
|
function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char;
|
|
SourceChars: cardinal): PAnsiChar; override;
|
|
/// convert any UTF-8 encoded buffer into Ansi Text
// - the converted text is returned in the result var parameter
|
|
procedure Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal;
|
|
var result: RawByteString); override;
|
|
/// convert any UTF-8 encoded String into Ansi Text
|
|
// - directly assign the input as result, since no conversion is needed
|
|
function Utf8ToAnsi(const u: RawUtf8): RawByteString; override;
|
|
/// convert any Ansi Text into a UTF-8 encoded String
|
|
// - directly assign the input as result, since no conversion is needed
|
|
function AnsiToUtf8(const AnsiText: RawByteString): RawUtf8; override;
|
|
/// direct conversion of a PAnsiChar buffer into a UTF-8 encoded string
// - the converted text is returned in the Value out parameter
|
|
procedure AnsiBufferToRawUtf8(Source: PAnsiChar;
|
|
SourceChars: cardinal; out Value: RawUtf8); override;
|
|
end;
|
|
|
|
/// a class to handle UTF-16 to/from Unicode translation
|
|
// - match the TSynAnsiConvert signature, for code page CP_UTF16
|
|
// - even if UTF-16 is not an Ansi format, code page CP_UTF16 may have been
|
|
// used to store UTF-16 encoded binary content
|
|
// - this class is mostly a non-operation for conversion to/from Unicode
|
|
TSynAnsiUtf16 = class(TSynAnsiConvert)
|
|
public
|
|
/// initialize the internal conversion engine
|
|
constructor Create(aCodePage: cardinal); override;
|
|
/// direct conversion of a PAnsiChar UTF-16 buffer into a Unicode buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*2 bytes
// - NOTE(review): it is not visible here whether SourceChars counts bytes
// or UTF-16 code units for this class - confirm against the implementation
|
|
// - will append a trailing #0 to the returned PWideChar, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUnicode(Dest: PWideChar; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PWideChar; override;
|
|
/// direct conversion of a PAnsiChar UTF-16 buffer into a UTF-8 encoded buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*3 bytes
|
|
// - will append a trailing #0 to the returned PUtf8Char, unless
|
|
// NoTrailingZero is set
|
|
function AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar;
|
|
SourceChars: cardinal; NoTrailingZero: boolean = false): PUtf8Char; override;
|
|
{$ifndef PUREMORMOT2}
|
|
/// convert any UTF-16 Ansi buffer into a Unicode String
|
|
// - returns a value using our RawUnicode kind of string
|
|
function AnsiToRawUnicode(Source: PAnsiChar;
|
|
SourceChars: cardinal): RawUnicode; override;
|
|
{$endif PUREMORMOT2}
|
|
/// direct conversion of a Unicode buffer into a PAnsiChar UTF-16 buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars * 3 bytes
// - NOTE(review): SourceChars * 3 matches the inherited contract, but a
// UTF-16 to UTF-16 copy presumably needs SourceChars * 2 bytes - confirm
|
|
function UnicodeBufferToAnsi(Dest: PAnsiChar; Source: PWideChar;
|
|
SourceChars: cardinal): PAnsiChar; override;
|
|
/// direct conversion of a UTF-8 encoded buffer into a PAnsiChar UTF-16 buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars bytes
// - NOTE(review): an ASCII UTF-8 byte presumably expands to 2 bytes once
// UTF-16 encoded, so "SourceChars bytes" looks too small for this class -
// confirm against the implementation before relying on this size
|
|
// - no trailing #0 is appended to the buffer
|
|
function Utf8BufferToAnsi(Dest: PAnsiChar; Source: PUtf8Char;
|
|
SourceChars: cardinal): PAnsiChar; override;
|
|
end;
|
|
|
|
|
|
var
|
|
/// global TSynAnsiConvert instance to handle WinAnsi encoding (code page 1252)
|
|
// - this instance is global and instantiated during the whole program life time
|
|
// - it will be created from hard-coded values, and not using the system API,
|
|
// since it appeared that some systems (e.g. in Russia) did tweak the registry
|
|
// so that 1252 code page maps 1251 code page
|
|
WinAnsiConvert: TSynAnsiFixedWidth;
|
|
|
|
/// global TSynAnsiConvert instance to handle current system encoding
|
|
// - this is the encoding as used by the AnsiString type, so will be used
|
|
// before Delphi 2009 to speed-up RTL string handling (especially for UTF-8)
|
|
// - this instance is global and instantiated during the whole program life time
|
|
CurrentAnsiConvert: TSynAnsiConvert;
|
|
|
|
/// global TSynAnsiConvert instance to handle UTF-8 encoding (code page CP_UTF8)
|
|
// - this instance is global and instantiated during the whole program life time
|
|
Utf8AnsiConvert: TSynAnsiUtf8;
|
|
|
|
/// global TSynAnsiConvert instance with no encoding (RawByteString/RawBlob)
|
|
RawByteStringConvert: TSynAnsiFixedWidth;
|
|
|
|
/// check if a code page is known to be of fixed width, i.e. not MBCS
|
|
// - i.e. will be implemented as a TSynAnsiFixedWidth
|
|
function IsFixedWidthCodePage(aCodePage: cardinal): boolean;
|
|
|
|
/// return a code page number into human-friendly text
|
|
function CodePageToText(aCodePage: cardinal): TShort16;
|
|
|
|
|
|
{ *************** Text File Loading with BOM/Unicode Support }
|
|
|
|
type
|
|
/// text file layout, as returned by BomFile() and StringFromBomFile()
|
|
// - bomNone means there was no BOM recognized
|
|
// - bomUnicode stands for UTF-16 LE encoding (as on Windows products)
|
|
// - bomUtf8 stands for a UTF-8 BOM (as on Windows products)
// - as documented for BomFile(), a recognized UTF-16LE or UTF-8 BOM takes
// the leading 2 or 3 bytes of the buffer
|
|
TBomFile = (
|
|
bomNone,
|
|
bomUnicode,
|
|
bomUtf8);
|
|
|
|
const
|
|
/// UTF-16LE BOM WideChar marker, as existing e.g. in some UTF-16 Windows files
|
|
BOM_UTF16LE = #$FEFF;
|
|
|
|
/// check the file BOM at the beginning of a file buffer
|
|
// - BOM is common only with Microsoft products
|
|
// - returns bomNone if no BOM was recognized
|
|
// - returns bomUnicode or bomUtf8 if UTF-16LE or UTF-8 BOM were recognized:
|
|
// and will adjust Buffer/BufferSize to ignore the leading 2 or 3 bytes
|
|
function BomFile(var Buffer: pointer; var BufferSize: PtrInt): TBomFile;
|
|
|
|
/// read a file into a temporary variable, check the BOM, and adjust the buffer
|
|
function StringFromBomFile(const FileName: TFileName; out FileContent: RawByteString;
|
|
out Buffer: pointer; out BufferSize: PtrInt): TBomFile;
|
|
|
|
/// read a File content into a RawUtf8, detecting any leading BOM
|
|
// - will assume text file with no BOM is already UTF-8 encoded
|
|
// - an alternative to StringFromFile() if you want to handle UTF-8 content
|
|
// and the files are likely to be natively UTF-8 encoded, or with a BOM
|
|
function RawUtf8FromFile(const FileName: TFileName): RawUtf8;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// read a File content into a RawUtf8, detecting any leading BOM
|
|
// - assume file with no BOM is encoded with the current Ansi code page, not
|
|
// UTF-8, unless AssumeUtf8IfNoBom is true and it behaves like RawUtf8FromFile()
|
|
function AnyTextFileToRawUtf8(const FileName: TFileName;
|
|
AssumeUtf8IfNoBom: boolean = false): RawUtf8;
|
|
|
|
/// read a File content into a RTL string, detecting any leading BOM
|
|
// - assume file with no BOM is encoded with the current Ansi code page, not UTF-8
|
|
// - if ForceUtf8 is true, won't detect the BOM but assume whole file is UTF-8
|
|
function AnyTextFileToString(const FileName: TFileName;
|
|
ForceUtf8: boolean = false): string;
|
|
|
|
/// read a File content into SynUnicode string, detecting any leading BOM
|
|
// - assume file with no BOM is encoded with the current Ansi code page, not UTF-8
|
|
// - if ForceUtf8 is true, won't detect the BOM but assume whole file is UTF-8
|
|
function AnyTextFileToSynUnicode(const FileName: TFileName;
|
|
ForceUtf8: boolean = false): SynUnicode;
|
|
|
|
|
|
{ *************** Low-Level String Conversion Functions }
|
|
|
|
/// will fast replace all #0 chars as ~
|
|
// - could be used after UniqueRawUtf8() on an in-place modified JSON buffer,
|
|
// in which all values have been ended with #0
|
|
// - you can optionally specify a maximum size, in bytes (this won't reallocate
|
|
// the string, but just add a #0 at some point in the UTF-8 buffer)
|
|
// - could allow logging of parsed input e.g. after an exception
|
|
procedure UniqueRawUtf8ZeroToTilde(var u: RawUtf8; MaxSize: PtrInt = maxInt);
|
|
|
|
/// convert a binary buffer into a fake ASCII/UTF-8 content without any #0 input
|
|
// - will use ~ char to escape any #0 as ~0 pair (and plain ~ as ~~ pair)
|
|
// - output is just a bunch of non 0 bytes, so not truly valid UTF-8 content
|
|
// - may be used as an alternative to Base64 encoding if 8-bit chars are allowed
|
|
// - call Zeroed() as reverse function
|
|
function UnZeroed(const bin: RawByteString): RawUtf8;
|
|
|
|
/// convert a fake UTF-8 buffer without any #0 input back into its original binary
|
|
// - may be used as an alternative to Base64 decoding if 8-bit chars are allowed
|
|
// - call UnZeroed() as reverse function
|
|
function Zeroed(const u: RawUtf8): RawByteString;
|
|
|
|
/// conversion of a wide char into a WinAnsi (CodePage 1252) char
|
|
// - return '?' for an unknown WideChar in code page 1252
|
|
function WideCharToWinAnsiChar(wc: cardinal): AnsiChar;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// conversion of a wide char into a WinAnsi (CodePage 1252) char index
|
|
// - return -1 for an unknown WideChar in code page 1252
|
|
function WideCharToWinAnsi(wc: cardinal): integer;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// return TRUE if the supplied unicode buffer only contains WinAnsi characters
|
|
// - i.e. if the text can be displayed using ANSI_CHARSET
|
|
function IsWinAnsi(WideText: PWideChar): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// return TRUE if the supplied unicode buffer only contains WinAnsi characters
|
|
// - i.e. if the text can be displayed using ANSI_CHARSET
|
|
function IsWinAnsi(WideText: PWideChar; Length: integer): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// return TRUE if the supplied UTF-8 buffer only contains WinAnsi characters
|
|
// - i.e. if the text can be displayed using ANSI_CHARSET
|
|
function IsWinAnsiU(Utf8Text: PUtf8Char): boolean;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// return TRUE if the supplied UTF-8 buffer only contains WinAnsi 8-bit characters
|
|
// - i.e. if the text can be displayed using ANSI_CHARSET with only 8-bit unicode
|
|
// characters (e.g. no "tm" or such)
|
|
function IsWinAnsiU8Bit(Utf8Text: PUtf8Char): boolean;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of an AnsiString with an unknown code page into an
|
|
// UTF-8 encoded String
|
|
// - will assume CurrentAnsiConvert.CodePage prior to Delphi 2009
|
|
// - newer UNICODE versions of Delphi will retrieve the code page from string
|
|
procedure AnyAnsiToUtf8(const s: RawByteString; var result: RawUtf8); overload;
|
|
|
|
/// direct conversion of an AnsiString with an unknown code page into an
|
|
// UTF-8 encoded String
|
|
// - will assume CurrentAnsiConvert.CodePage prior to Delphi 2009
|
|
// - newer UNICODE versions of Delphi will retrieve the code page from string
|
|
// - use AnsiToUtf8() if you want to specify the codepage
|
|
function AnyAnsiToUtf8(const s: RawByteString): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert an AnsiString (of a given code page) into a UTF-8 string
|
|
// - use AnyAnsiToUtf8() if you want to use the codepage of the input string
|
|
// - wrapper around TSynAnsiConvert.Engine(CodePage).AnsiToUtf8()
|
|
function AnsiToUtf8(const Ansi: RawByteString; CodePage: integer): RawUtf8;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert an AnsiChar buffer (of a given code page) into a UTF-8 string
|
|
// - the source code page should be supplied
|
|
// - wrapper around TSynAnsiConvert.Engine(CodePage).AnsiBufferToRawUtf8()
|
|
procedure AnsiCharToUtf8(P: PAnsiChar; L: integer; var result: RawUtf8;
|
|
CodePage: integer);
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert an AnsiString (of a given code page) into a RTL string
|
|
// - the source code page should be supplied
|
|
// - wrapper around TSynAnsiConvert.Engine(CodePage) and string conversion
|
|
function AnsiToString(const Ansi: RawByteString; CodePage: integer): string;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a WinAnsi (CodePage 1252) string into a UTF-8 encoded String
|
|
// - faster than SysUtils: don't use Utf8Encode(WideString) -> no Windows.Global(),
|
|
// and use a fixed pre-calculated array for individual chars conversion
|
|
function WinAnsiToUtf8(const S: WinAnsiString): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a WinAnsi (CodePage 1252) string into a UTF-8 encoded String
|
|
// - faster than SysUtils: don't use Utf8Encode(WideString) -> no Windows.Global(),
|
|
// and use a fixed pre-calculated array for individual chars conversion
|
|
function WinAnsiToUtf8(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a WinAnsi PAnsiChar buffer into a UTF-8 encoded buffer
|
|
// - Dest^ buffer must be reserved with at least SourceChars*3 bytes
|
|
// - call internally WinAnsiConvert fast conversion class
|
|
function WinAnsiBufferToUtf8(Dest: PUtf8Char;
|
|
Source: PAnsiChar; SourceChars: cardinal): PUtf8Char;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a WinAnsi ShortString into a UTF-8 text
|
|
// - call internally WinAnsiConvert fast conversion class
|
|
function ShortStringToUtf8(const source: ShortString): RawUtf8;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a WinAnsi (CodePage 1252) string into a Unicode buffer
|
|
// - very fast, by using a fixed pre-calculated array for individual chars conversion
|
|
// - text will be truncated if necessary to avoid buffer overflow in Dest[]
|
|
procedure WinAnsiToUnicodeBuffer(const S: WinAnsiString;
|
|
Dest: PWordArray; DestLen: PtrInt);
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a UTF-8 encoded string into a WinAnsi String
|
|
function Utf8ToWinAnsi(const S: RawUtf8): WinAnsiString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a UTF-8 encoded zero terminated buffer into a WinAnsi String
|
|
function Utf8ToWinAnsi(P: PUtf8Char): WinAnsiString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a UTF-8 encoded zero terminated buffer into a RawUtf8 String
|
|
procedure Utf8ToRawUtf8(P: PUtf8Char; var result: RawUtf8);
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a UTF-8 encoded buffer into a WinAnsi PAnsiChar buffer
|
|
function Utf8ToWinPChar(dest: PAnsiChar; source: PUtf8Char; count: integer): integer;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
/// direct conversion of a WinAnsi (CodePage 1252) string into a Unicode encoded String
|
|
// - very fast, by using a fixed pre-calculated array for individual chars conversion
|
|
function WinAnsiToRawUnicode(const S: WinAnsiString): RawUnicode;
|
|
|
|
/// convert a UTF-16 string into a WinAnsi (code page 1252) string
|
|
function RawUnicodeToWinAnsi(const Unicode: RawUnicode): WinAnsiString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert a UTF-8 encoded buffer into a RawUnicode string
|
|
// - if L is 0, L is computed from zero terminated P buffer
|
|
// - RawUnicode is ended by a WideChar(#0)
|
|
// - faster than System.Utf8Decode() which uses slow widestrings
|
|
function Utf8DecodeToRawUnicode(P: PUtf8Char; L: integer): RawUnicode; overload;
|
|
|
|
/// convert a UTF-8 string into a RawUnicode string
|
|
function Utf8DecodeToRawUnicode(const S: RawUtf8): RawUnicode; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert a UTF-8 string into a RawUnicode string
|
|
// - this version doesn't resize the length of the result RawUnicode
|
|
// and is therefore useful before a Win32 Unicode API call (with nCount=-1)
|
|
// - if DestLen is not nil, the resulting length (in bytes) will be stored within
|
|
// - see also Utf8DecodeToUnicode() which uses a TSynTempBuffer for storage
|
|
function Utf8DecodeToRawUnicodeUI(const S: RawUtf8;
|
|
DestLen: PInteger = nil): RawUnicode; overload;
|
|
|
|
/// convert a UTF-8 string into a RawUnicode string
|
|
// - returns the resulting length (in bytes) of the content stored within Dest
|
|
// - see also Utf8DecodeToUnicode() which uses a TSynTempBuffer for storage
|
|
function Utf8DecodeToRawUnicodeUI(const S: RawUtf8;
|
|
var Dest: RawUnicode): integer; overload;
|
|
|
|
/// convert a RawUnicode string into a UTF-8 string
|
|
function RawUnicodeToUtf8(const Unicode: RawUnicode): RawUtf8; overload;
|
|
|
|
/// convert any RawUnicode String into a generic SynUnicode Text
|
|
function RawUnicodeToSynUnicode(const Unicode: RawUnicode): SynUnicode; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string into a RawUnicode encoded String
|
|
// - it's preferred to use TLanguageFile.StringToUtf8() method in mORMoti18n,
|
|
// which will handle full i18n of your application
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function StringToRawUnicode(const S: string): RawUnicode; overload;
|
|
|
|
/// convert any RTL string into a RawUnicode encoded String
|
|
// - it's preferred to use TLanguageFile.StringToUtf8() method in mORMoti18n,
|
|
// which will handle full i18n of your application
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function StringToRawUnicode(P: PChar; L: integer): RawUnicode; overload;
|
|
|
|
/// convert any RawUnicode encoded string into a RTL string
|
|
// - uses StrLenW() and not length(U) to handle the case when U was used as a buffer
|
|
function RawUnicodeToString(const U: RawUnicode): string; overload;
|
|
{$endif PUREMORMOT2}
|
|
|
|
/// convert a SynUnicode string into a UTF-8 string
|
|
function SynUnicodeToUtf8(const Unicode: SynUnicode): RawUtf8;
|
|
|
|
/// convert a WideString into a UTF-8 string
|
|
function WideStringToUtf8(const aText: WideString): RawUtf8;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// direct conversion of a UTF-16 encoded buffer into a WinAnsi PAnsiChar buffer
|
|
procedure RawUnicodeToWinPChar(dest: PAnsiChar;
|
|
source: PWideChar; WideCharCount: integer);
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert a UTF-16 PWideChar buffer into a WinAnsi (code page 1252) string
|
|
function RawUnicodeToWinAnsi(
|
|
WideChar: PWideChar; WideCharCount: integer): WinAnsiString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert a WideString into a WinAnsi (code page 1252) string
|
|
function WideStringToWinAnsi(const Wide: WideString): WinAnsiString;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-16 buffer into a generic SynUnicode Text
|
|
function RawUnicodeToSynUnicode(
|
|
WideChar: PWideChar; WideCharCount: integer): SynUnicode; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert an Unicode buffer into a WinAnsi (code page 1252) string
|
|
procedure UnicodeBufferToWinAnsi(source: PWideChar; out Dest: WinAnsiString);
|
|
|
|
/// convert an Unicode buffer into a RTL string
|
|
function UnicodeBufferToString(source: PWideChar): string;
|
|
|
|
/// convert an Unicode buffer into a UTF-8 string
|
|
function UnicodeBufferToUtf8(source: PWideChar): RawUtf8;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// convert an Unicode buffer into a variant storing a UTF-8 string
|
|
// - could be used e.g. as TDocVariantData.AddValue() parameter
|
|
function UnicodeBufferToVariant(source: PWideChar): variant;
|
|
|
|
/// convert any RTL string into a variant storing a UTF-8 string
|
|
// - could be used e.g. as TDocVariantData.AddValue() parameter
|
|
function StringToVariant(const Txt: string): variant; overload;
|
|
|
|
/// convert any RTL string into a variant storing a UTF-8 string
|
|
// - could be used e.g. as TDocVariantData.AddValue() parameter
|
|
procedure StringToVariant(const Txt: string; var result: variant); overload;
|
|
|
|
{$ifdef HASVARUSTRING}
|
|
|
|
/// convert a Delphi 2009+ or FPC Unicode string into our UTF-8 string
|
|
function UnicodeStringToUtf8(const S: UnicodeString): RawUtf8; inline;
|
|
|
|
/// convert a UTF-8 encoded string into a Delphi 2009+ or FPC Unicode string
// - this function is the same as direct RawUtf8=AnsiString(CP_UTF8) assignment
|
|
// but is faster, since it uses no Win32 API call
|
|
function Utf8DecodeToUnicodeString(const S: RawUtf8): UnicodeString; overload; inline;
|
|
|
|
/// convert an UTF-8 encoded buffer into a Delphi 2009+ or FPC Unicode string
|
|
// - this function is the same as direct assignment, since RawUtf8=AnsiString(CP_UTF8),
|
|
// but is faster, since it uses no Win32 API call
|
|
procedure Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer;
|
|
var result: UnicodeString); overload;
|
|
|
|
/// convert a Delphi 2009+ or FPC Unicode string into a WinAnsi (code page 1252) string
|
|
function UnicodeStringToWinAnsi(const S: UnicodeString): WinAnsiString; inline;
|
|
|
|
/// convert our UTF-8 encoded buffer into a Delphi 2009+ or FPC Unicode string
|
|
// - this function is the same as direct assignment, since RawUtf8=AnsiString(CP_UTF8),
|
|
// but is faster, since it uses no Win32 API call
|
|
function Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer): UnicodeString; overload; inline;
|
|
|
|
/// convert a Win-Ansi encoded buffer into a Delphi 2009+ or FPC Unicode string
|
|
// - this function is faster than default RTL, since it uses no Win32 API call
|
|
function WinAnsiToUnicodeString(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): UnicodeString; overload;
|
|
|
|
/// convert a Win-Ansi string into a Delphi 2009+ or FPC Unicode string
|
|
// - this function is faster than default RTL, since it uses no Win32 API call
|
|
function WinAnsiToUnicodeString(const WinAnsi: WinAnsiString): UnicodeString; inline; overload;
|
|
|
|
{$endif HASVARUSTRING}
|
|
|
|
/// convert an UTF-8 encoded buffer into a UTF-16 encoded RawByteString buffer
|
|
// - could be used instead of deprecated RawUnicode when a temp UTF-16 buffer is needed
|
|
function Utf8DecodeToUnicodeRawByteString(P: PUtf8Char; L: integer): RawByteString; overload;
|
|
|
|
/// convert an UTF-8 encoded buffer into a UTF-16 encoded RawByteString buffer
|
|
// - could be used instead of deprecated RawUnicode when a temp UTF-16 buffer is needed
|
|
function Utf8DecodeToUnicodeRawByteString(const U: RawUtf8): RawByteString; overload;
|
|
|
|
/// convert an UTF-8 encoded buffer into a UTF-16 encoded stream of bytes
|
|
function Utf8DecodeToUnicodeStream(P: PUtf8Char; L: integer): TStream;
|
|
|
|
/// convert a Win-Ansi encoded buffer into a SynUnicode string
|
|
// - this function is faster than default RTL, since it uses no Win32 API call
|
|
function WinAnsiToSynUnicode(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): SynUnicode; overload;
|
|
|
|
/// convert a Win-Ansi string into a SynUnicode string
|
|
// - this function is faster than default RTL, since it uses no Win32 API call
|
|
function WinAnsiToSynUnicode(const WinAnsi: WinAnsiString): SynUnicode;
|
|
{$ifdef HASINLINE}inline;{$endif} overload;
|
|
|
|
/// convert any RTL string into an UTF-8 encoded String
|
|
// - in the VCL context, it's preferred to use TLanguageFile.StringToUtf8()
|
|
// method from mORMoti18n, which will handle full i18n of your application
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function StringToUtf8(const Text: string): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string buffer into an UTF-8 encoded String
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
procedure StringToUtf8(Text: PChar; TextLen: PtrInt; var result: RawUtf8); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string into an UTF-8 encoded String
|
|
// - this overloaded function uses a faster by-reference parameter for the result
|
|
procedure StringToUtf8(const Text: string; var result: RawUtf8); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string into an UTF-8 encoded String
|
|
function ToUtf8(const Text: string): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string into an UTF-8 encoded TSynTempBuffer
|
|
// - returns the number of UTF-8 bytes available in Temp.buf
|
|
// - this overloaded function uses a TSynTempBuffer for the result to avoid any
|
|
// memory allocation for the shorter content
|
|
// - caller should call Temp.Done to release any heap-allocated memory
|
|
function StringToUtf8(const Text: string; var Temp: TSynTempBuffer): integer; overload;
|
|
|
|
/// convert any Ansi memory buffer into UTF-8, using a TSynTempBuffer if needed
|
|
// - caller should release any memory by calling Temp.Done
|
|
// - returns a pointer to the UTF-8 converted buffer - which may be buf
|
|
function AnsiBufferToTempUtf8(var Temp: TSynTempBuffer;
|
|
Buf: PAnsiChar; BufLen, CodePage: cardinal): PUtf8Char;
|
|
|
|
/// convert any UTF-8 encoded ShortString Text into an UTF-8 encoded String
|
|
// - expects the supplied content to be already ASCII-7 or UTF-8 encoded, e.g.
|
|
// a RTTI type or property name: it won't work with Ansi-encoded strings
|
|
function ToUtf8(const Ansi7Text: ShortString): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string buffer into an UTF-8 encoded buffer
|
|
// - Dest must be able to receive at least SourceChars*3 bytes
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function StringBufferToUtf8(Dest: PUtf8Char;
|
|
Source: PChar; SourceChars: PtrInt): PUtf8Char; overload;
|
|
|
|
/// convert any RTL string 0-terminated Text buffer into an UTF-8 string
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
procedure StringBufferToUtf8(Source: PChar; out result: RawUtf8); overload;
|
|
|
|
/// convert any RTL string into a SynUnicode encoded String
|
|
// - it's preferred to use TLanguageFile.StringToUtf8() method in mORMoti18n,
|
|
// which will handle full i18n of your application
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function StringToSynUnicode(const S: string): SynUnicode; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any RTL string into a SynUnicode encoded String
|
|
// - overloaded to avoid a copy to a temporary result string of a function
|
|
procedure StringToSynUnicode(const S: string; var result: SynUnicode); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-16 encoded buffer into a RTL string
|
|
function RawUnicodeToString(P: PWideChar; L: integer): string; overload;
|
|
|
|
/// convert any UTF-16 encoded buffer into a RTL string
|
|
procedure RawUnicodeToString(P: PWideChar; L: integer; var result: string); overload;
|
|
|
|
/// convert any SynUnicode encoded string into a RTL string
|
|
function SynUnicodeToString(const U: SynUnicode): string;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded String into a RTL string
|
|
// - it's preferred to use TLanguageFile.Utf8ToString() in mORMoti18n,
|
|
// which will handle full i18n of your application
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function Utf8ToString(const Text: RawUtf8): string;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded String into a RTL string
|
|
procedure Utf8ToStringVar(const Text: RawUtf8; var result: string);
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded String into a generic RTL file name string
|
|
procedure Utf8ToFileName(const Text: RawUtf8; var result: TFileName);
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded buffer into a RTL string
|
|
// - it's preferred to use TLanguageFile.Utf8ToString() in mORMoti18n,
|
|
// which will handle full i18n of your application
|
|
// - it will work as is with Delphi 2009+ (direct unicode conversion)
|
|
// - under older version of Delphi (no unicode), it will use the
|
|
// current RTL codepage, as with WideString conversion (but without slow
|
|
// WideString usage)
|
|
function Utf8DecodeToString(P: PUtf8Char; L: integer): string; overload;
|
|
{$ifdef UNICODE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded buffer into a RTL string
|
|
procedure Utf8DecodeToString(P: PUtf8Char; L: integer; var result: string); overload;
|
|
|
|
/// convert any UTF-8 encoded String into a generic WideString Text
|
|
function Utf8ToWideString(const Text: RawUtf8): WideString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded String into a generic WideString Text
|
|
procedure Utf8ToWideString(const Text: RawUtf8; var result: WideString); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded String into a generic WideString Text
|
|
procedure Utf8ToWideString(Text: PUtf8Char; Len: PtrInt; var result: WideString); overload;
|
|
|
|
/// convert any UTF-8 encoded String into a generic SynUnicode Text
|
|
function Utf8ToSynUnicode(const Text: RawUtf8): SynUnicode; overload;
|
|
|
|
/// convert any UTF-8 encoded String into a generic SynUnicode Text
|
|
procedure Utf8ToSynUnicode(const Text: RawUtf8; var result: SynUnicode); overload;
|
|
|
|
/// convert any UTF-8 encoded buffer into a generic SynUnicode Text
|
|
procedure Utf8ToSynUnicode(Text: PUtf8Char; Len: PtrInt; var result: SynUnicode); overload;
|
|
|
|
/// convert any UTF-8 encoded string into an UTF-16 temporary buffer
|
|
// - returns the number of WideChar stored in temp (not bytes)
|
|
// - caller should call temp.Done after temp.buf has been used
|
|
function Utf8DecodeToUnicode(const Text: RawUtf8; var temp: TSynTempBuffer): PtrInt; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any UTF-8 encoded buffer into an UTF-16 temporary buffer
|
|
function Utf8DecodeToUnicode(Text: PUtf8Char; Len: PtrInt; var temp: TSynTempBuffer): PtrInt; overload;
|
|
|
|
/// convert any Ansi 7-bit encoded String into a RTL string
|
|
// - the Text content must contain only 7-bit pure ASCII characters
|
|
function Ansi7ToString(const Text: RawByteString): string; overload;
|
|
{$ifndef UNICODE}{$ifdef HASINLINE}inline;{$endif}{$endif}
|
|
|
|
/// convert any Ansi 7-bit encoded String into a RTL string
|
|
// - the Text content must contain only 7-bit pure ASCII characters
|
|
function Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt): string; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// convert any Ansi 7-bit encoded String into a RTL string
|
|
// - the Text content must contain only 7-bit pure ASCII characters
|
|
procedure Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt; var result: string); overload;
|
|
|
|
/// convert any RTL string into Ansi 7-bit encoded String
|
|
// - the Text content must contain only 7-bit pure ASCII characters
|
|
function StringToAnsi7(const Text: string): RawByteString;
|
|
{$ifndef UNICODE}{$ifdef HASINLINE}inline;{$endif}{$endif}
|
|
|
|
/// convert any RTL string into WinAnsi (Win-1252) 8-bit encoded String
|
|
function StringToWinAnsi(const Text: string): WinAnsiString;
|
|
{$ifdef UNICODE}inline;{$endif}
|
|
|
|
|
|
|
|
{ **************** Text Case-(in)sensitive Conversion and Comparison }
|
|
|
|
type
|
|
/// lookup table used for fast case conversion
|
|
TNormTable = packed array[AnsiChar] of AnsiChar;
|
|
/// pointer to a lookup table used for fast case conversion
|
|
PNormTable = ^TNormTable;
|
|
|
|
/// lookup table used for fast case conversion
|
|
TNormTableByte = packed array[byte] of byte;
|
|
/// pointer to a lookup table used for fast case conversion
|
|
PNormTableByte = ^TNormTableByte;
|
|
|
|
var
|
|
/// lookup table used for fast case conversion to uppercase
|
|
// - handle 8-bit upper chars as in WinAnsi / code page 1252 (e.g. accents)
|
|
// - is defined globally, since may be used from an inlined function
|
|
NormToUpper: TNormTable;
|
|
NormToUpperByte: TNormTableByte absolute NormToUpper;
|
|
|
|
/// lookup table used for fast case conversion to lowercase
|
|
// - handle 8-bit upper chars as in WinAnsi / code page 1252 (e.g. accents)
|
|
// - is defined globally, since may be used from an inlined function
|
|
NormToLower: TNormTable;
|
|
NormToLowerByte: TNormTableByte absolute NormToLower;
|
|
|
|
/// this table will convert 'a'..'z' into 'A'..'Z'
|
|
// - so it will work with UTF-8 without decoding, whereas NormToUpper[] expects
|
|
// WinAnsi encoding
|
|
NormToUpperAnsi7: TNormTable;
|
|
NormToUpperAnsi7Byte: TNormTableByte absolute NormToUpperAnsi7;
|
|
|
|
/// this table will convert 'A'..'Z' into 'a'..'z'
|
|
// - so it will work with UTF-8 without decoding, whereas NormToLower[] expects
|
|
// WinAnsi encoding
|
|
NormToLowerAnsi7: TNormTable;
|
|
NormToLowerAnsi7Byte: TNormTableByte absolute NormToLowerAnsi7;
|
|
|
|
/// case sensitive NormToUpper[]/NormToLower[]-like table
|
|
// - i.e. NormToNorm[c] = c
|
|
NormToNorm: TNormTable;
|
|
NormToNormByte: TNormTableByte absolute NormToNorm;
|
|
|
|
const
|
|
/// lookup of a case-conversion table pointer from a boolean index
// - NORM2CASE[false] is nil and NORM2CASE[true] is @NormToUpperAnsi7
// - NOTE(review): the boolean index presumably means "case insensitive" -
// confirm against the callers
NORM2CASE: array[boolean] of PNormTable = (nil, @NormToUpperAnsi7);
|
|
|
|
type
|
|
/// character categories for text linefeed/word/identifier/uri parsing
|
|
// - using such a set compiles into TEST [MEM], IMM so is more efficient
|
|
// than a regular set of AnsiChar which generates much slower BT [MEM], IMM
|
|
// - the same 256-byte memory will also be reused from L1 CPU cache
|
|
// during the parsing of complex input
|
|
// - eight members are declared, which is consistent with one byte per set
// and with the 256-byte lookup table mentioned above
// - NOTE(review): the meaning of each member is not documented here; the
// names suggest tcNot01013 = any char but #0/#10/#13, tc1013 = #10/#13, and
// tcIdentifierFirstChar/tcIdentifier = pascal identifier chars - confirm
// against the code which initializes the TEXT_CHARS[] lookup table
TTextChar = set of (
|
|
tcNot01013,
|
|
tc1013,
|
|
tcCtrlNotLF,
|
|
tcCtrlNot0Comma,
|
|
tcWord,
|
|
tcIdentifierFirstChar,
|
|
tcIdentifier,
|
|
tcUriUnreserved);
|
|
|
|
/// defines an AnsiChar lookup table used for branch-less text parsing
|
|
TTextCharSet = array[AnsiChar] of TTextChar;
|
|
/// points to an AnsiChar lookup table used for branch-less text parsing
|
|
PTextCharSet = ^TTextCharSet;
|
|
|
|
/// defines an Ordinal lookup table used for branch-less text parsing
|
|
TTextByteSet = array[byte] of TTextChar;
|
|
/// points to an Ordinal lookup table used for branch-less text parsing
|
|
PTextByteSet = ^TTextByteSet;
|
|
|
|
var
|
|
/// lookup table for text linefeed/word/identifier/uri branch-less parsing
|
|
TEXT_CHARS: TTextCharSet;
|
|
TEXT_BYTES: TTextByteSet absolute TEXT_CHARS;
|
|
|
|
/// returns TRUE if the given text buffer contains a..z,A..Z,0..9,_ characters
|
|
// - should match most usual property names values or other identifier names
|
|
// in the business logic source code
|
|
// - i.e. can be tested via IdemPropName*() functions, and the MongoDB-like
|
|
// extended JSON syntax as generated by dvoSerializeAsExtendedJson
|
|
// - following classic pascal naming convention, first char must be alphabetical
|
|
// or '_' (i.e. not a digit), following chars can be alphanumerical or '_'
|
|
function PropNameValid(P: PUtf8Char): boolean;
|
|
|
|
/// returns TRUE if the given text buffers contain A..Z,0..9,_ characters
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - this function allows numbers as first char, so won't check the first char
|
|
// the same way as PropNameValid() which refuses digits as pascal convention
|
|
function PropNamesValid(const Values: array of RawUtf8): boolean;
|
|
|
|
/// try to generate a PropNameValid() output from an incoming text
|
|
// - will trim all spaces, and replace most special chars by '_'
|
|
// - if it is not PropNameValid() after those replacements, will return fallback
|
|
function PropNameSanitize(const text, fallback: RawUtf8): RawUtf8;
|
|
|
|
/// case insensitive comparison of ASCII 7-bit identifiers
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - behavior is undefined with UTF-8 encoding (some false positive may occur)
|
|
function IdemPropName(const P1, P2: ShortString): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// case insensitive comparison of ASCII 7-bit identifiers
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - behavior is undefined with UTF-8 encoding (some false positive may occur)
|
|
function IdemPropName(const P1: ShortString; P2: PUtf8Char; P2Len: PtrInt): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// case insensitive comparison of ASCII 7-bit identifiers
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - behavior is undefined with UTF-8 encoding (some false positive may occur)
|
|
// - this version expects P1 and P2 to be a PAnsiChar with specified lengths
|
|
function IdemPropName(P1, P2: PUtf8Char; P1Len, P2Len: PtrInt): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// case insensitive comparison of ASCII 7-bit identifiers
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - behavior is undefined with UTF-8 encoding (some false positive may occur)
|
|
// - this version expects P2 to be a PAnsiChar with specified length
|
|
function IdemPropNameU(const P1: RawUtf8; P2: PUtf8Char; P2Len: PtrInt): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// case insensitive comparison of ASCII 7-bit identifiers of same length
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - behavior is undefined with UTF-8 encoding (some false positive may occur)
|
|
// - this version expects P1 and P2 to be a PAnsiChar with an already checked
|
|
// identical length, so may be used for a faster process, e.g. in a loop
|
|
// - if P1 and P2 are RawUtf8, you should better call overloaded function
|
|
// IdemPropNameU(const P1,P2: RawUtf8), which would be slightly faster by
|
|
// using the length stored before the actual text buffer of each RawUtf8
|
|
function IdemPropNameUSameLenNotNull(P1, P2: PUtf8Char; P1P2Len: PtrInt): boolean;
|
|
{$ifdef FPC}inline;{$endif} // Delphi does not like to inline goto
|
|
|
|
type
|
|
/// function prototype comparing two buffers sharing the same P1P2Len length
// - P1 and P2 are untyped pointers to the two buffers to compare
// - returns a boolean, presumably TRUE when both buffers match - TODO confirm
// against the implementations
// - is the item type of the IdemPropNameUSameLen[] global lookup array
TIdemPropNameUSameLen = function(P1, P2: pointer; P1P2Len: PtrInt): boolean;
|
|
|
|
var
|
|
/// case (in)sensitive comparison of ASCII 7-bit identifiers of same length
|
|
IdemPropNameUSameLen: array[{casesensitive=}boolean] of TIdemPropNameUSameLen;
|
|
|
|
/// case insensitive comparison of ASCII 7-bit identifiers
|
|
// - use it with property names values (i.e. only including A..Z,0..9,_ chars)
|
|
// - behavior is undefined with UTF-8 encoding (some false positive may occur)
|
|
// - is an alternative with PropNameEquals() to be used inlined e.g. in a loop
|
|
function IdemPropNameU(const P1, P2: RawUtf8): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns true if the beginning of p^ is the same as up^
|
|
// - ignore case - up^ must be already Upper
|
|
// - chars are compared as 7-bit Ansi only (no accentuated characters): but when
|
|
// you only need to search for field names e.g. IdemPChar() is preferred, because
|
|
// it'll be faster than IdemPCharU(), if UTF-8 decoding is not mandatory
|
|
// - if p is nil, will return FALSE
|
|
// - if up is nil, will return TRUE
|
|
function IdemPChar(p: PUtf8Char; up: PAnsiChar): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns true if the beginning of p^ is the same as up^
|
|
// - this overloaded function accept the uppercase lookup buffer as parameter
|
|
function IdemPChar(p: PUtf8Char; up: PAnsiChar; table: PNormTable): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns true if the beginning of p^ is the same as up^, ignoring white spaces
|
|
// - ignore case - up^ must be already Upper
|
|
// - any white space in the input p^ buffer is just ignored
|
|
// - chars are compared as 7-bit Ansi only (no accentuated characters): but when
|
|
// you only need to search for field names e.g. IdemPChar() is preferred, because
|
|
// it'll be faster than IdemPCharU(), if UTF-8 decoding is not mandatory
|
|
// - if p is nil, will return FALSE
|
|
// - if up is nil, will return TRUE
|
|
function IdemPCharWithoutWhiteSpace(p: PUtf8Char; up: PAnsiChar): boolean;
|
|
|
|
/// returns the index of a matching beginning of p^ in upArray[]
|
|
// - returns -1 if no item matched
|
|
// - ignore case - upArray^ must be already Upper
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
// - warning: this function expects upArray[] items to have AT LEAST TWO
|
|
// CHARS (it will use a fast 16-bit comparison of initial 2 bytes)
|
|
// - consider IdemPPChar() which is faster but a bit more verbose
|
|
function IdemPCharArray(p: PUtf8Char; const upArray: array of PAnsiChar): integer;
|
|
|
|
/// returns the index of a matching beginning of p^ in nil-terminated up^ array
|
|
// - returns -1 if no item matched
|
|
// - ignore case - each up^ must be already Upper
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
// - warning: this function expects up^ items to have AT LEAST TWO CHARS
|
|
// (it will use a fast 16-bit comparison of initial 2 bytes)
|
|
function IdemPPChar(p: PUtf8Char; up: PPAnsiChar): PtrInt;
|
|
|
|
/// returns the index of a matching beginning of p^ in upArrayBy2Chars, read as two-character items
|
|
// - returns -1 if no item matched
|
|
// - ignore case - upArray^ must be already Upper
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
function IdemPCharArrayBy2(p: PUtf8Char; const upArrayBy2Chars: RawUtf8): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns true if the beginning of p^ is the same as up^
|
|
// - ignore case - up^ must be already Upper
|
|
// - this version will decode the UTF-8 content before using NormToUpper[], so
|
|
// it will be slower than the IdemPChar() function above, but will handle
|
|
// WinAnsi accentuated characters (e.g. 'e' acute will be matched as 'E')
|
|
function IdemPCharU(p, up: PUtf8Char): boolean;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns true if the beginning of p^ is same as up^
|
|
// - ignore case - up^ must be already Upper
|
|
// - this version expects p^ to point to an Unicode char array
|
|
function IdemPCharW(p: PWideChar; up: PUtf8Char): boolean;
|
|
|
|
/// check case-insensitive matching starting of text in upTextStart
|
|
// - returns true if the item matched
|
|
// - ignore case - upTextStart must be already in upper case
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
// - see StartWithExact() in this unit for a case-sensitive version
|
|
function StartWith(const text, upTextStart: RawUtf8): boolean;
|
|
|
|
/// check case-insensitive matching ending of text in upTextEnd
|
|
// - returns true if the item matched
|
|
// - ignore case - upTextEnd must be already in upper case
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
// - see EndWithExact() in this unit for a case-sensitive version
|
|
function EndWith(const text, upTextEnd: RawUtf8): boolean;
|
|
|
|
/// returns the index of a case-insensitive matching ending of text in upArray[]
|
|
// - returns -1 if no item matched
|
|
// - ignore case - upArray[] items must be already in upper case
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
function EndWithArray(const text: RawUtf8; const upArray: array of RawUtf8): integer;
|
|
|
|
/// returns true if the file name extension contained in p^ is the same as extup^
|
|
// - ignore case - extup^ must be already Upper
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
// - could be used e.g. like IdemFileExt(aFileName,'.JP');
|
|
function IdemFileExt(p: PUtf8Char; extup: PAnsiChar; sepChar: AnsiChar = '.'): boolean;
|
|
|
|
/// returns the index of the matching file name extension in extup[]
|
|
// - ignore case - extup[] must be already Upper
|
|
// - chars are compared as 7-bit Ansi only (no accentuated chars, nor UTF-8)
|
|
// - could be used e.g. like IdemFileExts(aFileName,['.PAS','.INC']);
|
|
function IdemFileExts(p: PUtf8Char; const extup: array of PAnsiChar;
|
|
sepChar: AnsiChar = '.'): integer;
|
|
|
|
/// fast retrieve the position of any value of a given set of characters
|
|
// - see also strspn() function which is likely to be faster
|
|
function PosCharAny(Str: PUtf8Char; Characters: PAnsiChar): PUtf8Char;
|
|
|
|
/// a non case-sensitive RawUtf8 version of Pos()
|
|
// - uppersubstr is expected to be already in upper case
|
|
// - this version handles only 7-bit ASCII (no accentuated characters)
|
|
// - see PosIU() if you want an UTF-8 version with accentuated chars support
|
|
function PosI(uppersubstr: PUtf8Char; const str: RawUtf8): PtrInt;
|
|
|
|
/// a non case-sensitive version of Pos()
|
|
// - uppersubstr is expected to be already in upper case
|
|
// - this version handles only 7-bit ASCII (no accentuated characters)
|
|
function StrPosI(uppersubstr, str: PUtf8Char): PUtf8Char;
|
|
|
|
/// a non case-sensitive RawUtf8 version of Pos()
|
|
// - substr is expected to be already in upper case
|
|
// - this version will decode the UTF-8 content before using NormToUpper[]
|
|
// - see PosI() for a non-accentuated, but faster version
|
|
function PosIU(substr: PUtf8Char; const str: RawUtf8): integer;
|
|
|
|
/// pure pascal version of strspn(), to be used with PUtf8Char/PAnsiChar
|
|
// - returns size of initial segment of s which appears in accept chars, e.g.
|
|
// ! strspn('abcdef','debca')=5
|
|
// - please note that this optimized version may read up to 3 bytes beyond
|
|
// accept but never after s end, so is safe e.g. over memory mapped files
|
|
function strspn(s, accept: pointer): integer;
|
|
|
|
/// pure pascal version of strcspn(), to be used with PUtf8Char/PAnsiChar
|
|
// - returns size of initial segment of s which doesn't appear in reject chars, e.g.
|
|
// ! strcspn('1234,6789',',')=4
|
|
// - please note that this optimized version may read up to 3 bytes beyond
|
|
// reject but never after s end, so is safe e.g. over memory mapped files
|
|
function strcspn(s, reject: pointer): integer;
|
|
|
|
/// our fast version of StrCompL(), to be used with PUtf8Char
|
|
// - i.e. make a binary comparison of two memory buffers, using supplied length
|
|
// - Default value is returned if both P1 and P2 buffers are equal
|
|
function StrCompL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// our fast version of StrCompIL(), to be used with PUtf8Char
|
|
// - i.e. make a case-insensitive comparison of two memory buffers, using
|
|
// supplied length
|
|
// - Default value is returned if both P1 and P2 buffers are equal
|
|
function StrCompIL(P1, P2: pointer; L: PtrInt; Default: PtrInt = 0): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// our fast version of StrIComp(), to be used with PUtf8Char/PAnsiChar
|
|
function StrIComp(Str1, Str2: pointer): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// StrIComp-like function with a lookup table and Str1/Str2 expected not nil
|
|
function StrICompNotNil(Str1, Str2: pointer; Up: PNormTableByte): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// StrIComp-like function with a length, lookup table and Str1/Str2 expected not nil
|
|
function StrICompLNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// StrIComp function with a length, lookup table and Str1/Str2 expected not nil
|
|
// - returns L for whole match, or < L for a partial match
|
|
function StrILNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
type
|
|
/// function prototype used internally for UTF-8 buffer comparison
|
|
// - also used e.g. in mormot.core.variants unit
|
|
TUtf8Compare = function(P1, P2: PUtf8Char): PtrInt;
|
|
|
|
var
|
|
/// a quick wrapper to StrComp or StrIComp comparison functions
|
|
StrCompByCase: array[{CaseInsensitive=}boolean] of TUtf8Compare;
|
|
|
|
/// retrieve the next UCS4 CodePoint stored in U, then update the U pointer
|
|
// - this function will decode the UTF-8 content before using NormToUpper[]
|
|
// - will return '?' if the UCS4 CodePoint is higher than #255: so use this function
|
|
// only if you need to deal with ASCII characters (e.g. it's used for Soundex
|
|
// and for ContainsUtf8 function)
|
|
function GetNextUtf8Upper(var U: PUtf8Char): Ucs4CodePoint;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// points to the beginning of the next word stored in U
|
|
// - returns nil if reached the end of U (i.e. #0 char)
|
|
// - here a "word" is a Win-Ansi word, i.e. '0'..'9', 'A'..'Z'
|
|
function FindNextUtf8WordBegin(U: PUtf8Char): PUtf8Char;
|
|
|
|
/// return true if UpperValue (Ansi) is contained in A^ (Ansi)
|
|
// - find UpperValue starting at word beginning, not inside words
|
|
function FindAnsi(A, UpperValue: PAnsiChar): boolean;
|
|
|
|
/// return true if UpperValue (Ansi) is contained in U^ (UTF-8 encoded)
|
|
// - find UpperValue starting at word beginning, not inside words
|
|
// - UTF-8 decoding is done on the fly (no temporary decoding buffer is used)
|
|
function FindUtf8(U: PUtf8Char; UpperValue: PAnsiChar): boolean;
|
|
|
|
/// return true if Upper (Unicode encoded) is contained in PW^ (also Unicode encoded)
|
|
// - will use the slow but accurate Operating System API (Win32 or ICU)
|
|
// to perform the comparison at Unicode-level
|
|
// - consider using StrPosIReference() for our faster Unicode 10.0 version
|
|
function FindUnicode(PW: PWideChar; Upper: PWideChar; UpperLen: PtrInt): boolean;
|
|
|
|
/// return true if up^ is contained inside the UTF-8 buffer p^
|
|
// - search up^ at the beginning of every UTF-8 word (aka in Soundex)
|
|
// - here a "word" is a Win-Ansi word, i.e. '0'..'9', 'A'..'Z'
|
|
// - up^ must be already Upper
|
|
function ContainsUtf8(p, up: PUtf8Char): boolean;
|
|
|
|
/// returns TRUE if the supplied uppercased text is contained in the text buffer
|
|
function GetLineContains(p, pEnd, up: PUtf8Char): boolean;
|
|
{$ifdef FPC}inline;{$endif} // Delphi does not like inlining goto+label
|
|
|
|
/// copy source into a 256 chars dest^ buffer with 7-bit upper case conversion
|
|
// - used internally for short keys match or case-insensitive hash
|
|
// - returns final dest pointer
|
|
// - will copy up to 255 AnsiChar (expect the dest buffer to be defined e.g. as
|
|
// array[byte] of AnsiChar on the caller stack)
|
|
function UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PAnsiChar; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// copy source^ into a 256 chars dest^ buffer with 7-bit upper case conversion
|
|
// - used internally for short keys match or case-insensitive hash
|
|
// - returns final dest pointer
|
|
// - will copy up to 255 AnsiChar (expect the dest buffer to be defined e.g. as
|
|
// array[byte] of AnsiChar on the caller stack)
|
|
function UpperCopy255Buf(dest: PAnsiChar; source: PUtf8Char; sourceLen: PtrInt): PAnsiChar;
|
|
|
|
/// copy source into dest^ with WinAnsi 8-bit upper case conversion
|
|
// - used internally for short keys match or case-insensitive hash
|
|
// - returns final dest pointer
|
|
// - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of
|
|
// AnsiChar)
|
|
function UpperCopyWin255(dest: PWinAnsiChar; const source: RawUtf8): PWinAnsiChar;
|
|
|
|
/// copy UTF-16 source into dest^ with ASCII 7-bit upper case conversion
|
|
// - used internally for short keys match or case-insensitive hash
|
|
// - returns final dest pointer
|
|
// - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of
|
|
// AnsiChar), replacing any non WinAnsi character by '?'
|
|
function UpperCopy255W(dest: PAnsiChar; const source: SynUnicode): PAnsiChar; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// copy WideChar source into dest^ with upper case conversion
|
|
// - used internally for short keys match or case-insensitive hash
|
|
// - returns final dest pointer
|
|
// - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of
|
|
// AnsiChar), replacing any non WinAnsi character by '?'
|
|
function UpperCopy255W(dest: PAnsiChar; source: PWideChar; L: PtrInt): PAnsiChar; overload;
|
|
|
|
/// copy source into dest^ with ASCII 7-bit upper case conversion
|
|
// - returns final dest pointer
|
|
// - will copy up to the source buffer end: so Dest^ should be big enough -
|
|
// which will be the case e.g. if Dest := pointer(source)
|
|
function UpperCopy(dest: PAnsiChar; const source: RawUtf8): PAnsiChar;
|
|
|
|
/// copy source into dest^ with ASCII 7-bit upper case conversion
|
|
// - returns final dest pointer
|
|
// - this special version expects source to be a ShortString
|
|
function UpperCopyShort(dest: PAnsiChar; const source: ShortString): PAnsiChar;
|
|
|
|
/// fast UTF-8 comparison handling WinAnsi CP-1252 case folding
|
|
// - this version expects u1 and u2 to be zero-terminated
|
|
// - decode the UTF-8 content before using NormToUpper[] lookup table
|
|
// - matches our SYSTEMNOCASE custom (and default) SQLite 3 collation
|
|
// - consider Utf8ICompReference() for Unicode 10.0 support
|
|
function Utf8IComp(u1, u2: PUtf8Char): PtrInt;
|
|
|
|
/// fast UTF-8 comparison handling WinAnsi CP-1252 case folding
|
|
// - this version expects u1 and u2 not to be necessarily zero-terminated, but
|
|
// uses L1 and L2 as length for u1 and u2 respectively
|
|
// - decode the UTF-8 content before using NormToUpper[] lookup table
|
|
// - consider Utf8ILCompReference() for Unicode 10.0 support
|
|
function Utf8ILComp(u1, u2: PUtf8Char; L1, L2: cardinal): PtrInt;
|
|
|
|
/// copy UTF-8 buffer into dest^ handling WinAnsi CP-1252 NormToUpper[] folding
|
|
// - returns the final dest pointer
|
|
// - current implementation handles UTF-16 surrogates
|
|
function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Char;
|
|
|
|
/// copy UTF-8 buffer into dest^ handling WinAnsi CP-1252 NormToUpper[] folding
|
|
// - returns the final dest pointer
|
|
// - will copy up to 255 AnsiChar (expect the dest buffer to be array[byte] of
|
|
// AnsiChar), with UTF-8 encoding
|
|
function Utf8UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PUtf8Char;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// fast case-insensitive Unicode comparison handling ASCII 7-bit chars
|
|
// - use the NormToUpperAnsi7Byte[] array, i.e. compare 'a'..'z' as 'A'..'Z'
|
|
// - this version expects u1 and u2 to be zero-terminated
|
|
function AnsiICompW(u1, u2: PWideChar): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// compare two "array of AnsiString" elements, with no case sensitivity
|
|
// - just a wrapper around inlined StrIComp()
|
|
function SortDynArrayAnsiStringI(const A, B): integer;
|
|
|
|
/// compare two "array of PUtf8Char/PAnsiChar" elements, with no case sensitivity
|
|
// - just a wrapper around inlined StrIComp()
|
|
function SortDynArrayPUtf8CharI(const A, B): integer;
|
|
|
|
/// compare two "array of RTL string" elements, with no case sensitivity
|
|
// - the expected string type is the RTL string
|
|
// - just a wrapper around StrIComp() for AnsiString or AnsiICompW() for UNICODE
|
|
function SortDynArrayStringI(const A, B): integer;
|
|
|
|
/// compare two "array of WideString/UnicodeString" elements, with no case sensitivity
|
|
// - implemented here since would call AnsiICompW()
|
|
function SortDynArrayUnicodeStringI(const A, B): integer;
|
|
|
|
var
|
|
/// a quick wrapper to SortDynArrayAnsiString or SortDynArrayAnsiStringI
|
|
// comparison functions
|
|
SortDynArrayAnsiStringByCase: array[{CaseInsensitive=}boolean] of TDynArraySortCompare;
|
|
|
|
/// SameText() overloaded function with proper UTF-8 decoding
|
|
// - fast version using NormToUpper[] array for all WinAnsi characters
|
|
// - this version will decode each UTF-8 glyph before using NormToUpper[]
|
|
// - current implementation handles UTF-16 surrogates as Utf8IComp()
|
|
function SameTextU(const S1, S2: RawUtf8): boolean;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// fast conversion of the supplied text into 8-bit uppercase
|
|
// - this will not only convert 'a'..'z' into 'A'..'Z', but also accentuated
|
|
// latin characters ('e' acute into 'E' e.g.), using NormToUpper[] array
|
|
// - it will therefore decode the supplied UTF-8 content to handle more than
|
|
// 7-bit of ascii characters (so this function is dedicated to WinAnsi code page
|
|
// 1252 characters set)
|
|
function UpperCaseU(const S: RawUtf8): RawUtf8;
|
|
|
|
/// fast conversion of the supplied text into 8-bit lowercase
|
|
// - this will not only convert 'A'..'Z' into 'a'..'z', but also accentuated
|
|
// latin characters ('E' acute into 'e' e.g.), using NormToLower[] array
|
|
// - it will therefore decode the supplied UTF-8 content to handle more than
|
|
// 7-bit of ascii characters
|
|
function LowerCaseU(const S: RawUtf8): RawUtf8;
|
|
|
|
/// fast conversion of the supplied text into 8-bit case sensitivity
|
|
// - convert the text in-place, returns the resulting length
|
|
// - it will decode the supplied UTF-8 content to handle more than 7-bit
|
|
// of ascii characters during the conversion (leaving not WinAnsi characters
|
|
// untouched)
|
|
// - will not set the last char to #0 (caller must do that if necessary)
|
|
function ConvertCaseUtf8(P: PUtf8Char; const Table: TNormTableByte): PtrInt;
|
|
|
|
/// check if the supplied text has some case-insensitive 'a'..'z','A'..'Z' chars
|
|
// - will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
function IsCaseSensitive(const S: RawUtf8): boolean; overload;
|
|
|
|
/// check if the supplied text has some case-insensitive 'a'..'z','A'..'Z' chars
|
|
// - will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
function IsCaseSensitive(P: PUtf8Char; PLen: PtrInt): boolean; overload;
|
|
|
|
/// low-level function called when inlining UpperCase(Copy) and LowerCase(Copy)
|
|
procedure CaseCopy(Text: PUtf8Char; Len: PtrInt; Table: PNormTable;
|
|
var Dest: RawUtf8);
|
|
|
|
/// low-level function called when inlining UpperCaseSelf and LowerCaseSelf
|
|
procedure CaseSelf(var S: RawUtf8; Table: PNormTable);
|
|
|
|
/// fast conversion of the supplied text into uppercase
|
|
// - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and
|
|
// will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
function UpperCase(const S: RawUtf8): RawUtf8;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// fast conversion of the supplied text into uppercase
|
|
// - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and
|
|
// will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
procedure UpperCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8); overload;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// fast conversion of the supplied text into uppercase
|
|
// - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and
|
|
// will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
procedure UpperCaseCopy(const Source: RawUtf8; var Dest: RawUtf8); overload;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// fast in-place conversion of the supplied variable text into uppercase
|
|
// - this will only convert 'a'..'z' into 'A'..'Z' (no NormToUpper use), and
|
|
// will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
procedure UpperCaseSelf(var S: RawUtf8);
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// fast conversion of the supplied text into lowercase
|
|
// - this will only convert 'A'..'Z' into 'a'..'z' (no NormToLower use), and
|
|
// will therefore be correct with true UTF-8 content
|
|
function LowerCase(const S: RawUtf8): RawUtf8;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// fast conversion of the supplied text into lowercase
|
|
// - this will only convert 'A'..'Z' into 'a'..'z' (no NormToLower use), and
|
|
// will therefore be correct with true UTF-8 content
|
|
procedure LowerCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8);
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// fast in-place conversion of the supplied variable text into lowercase
|
|
// - this will only convert 'A'..'Z' into 'a'..'z' (no NormToLower use), and
|
|
// will therefore be correct with true UTF-8 content, but only for 7-bit
|
|
procedure LowerCaseSelf(var S: RawUtf8);
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// accurate conversion of the supplied UTF-8 content into the corresponding
|
|
// upper-case Unicode characters
|
|
// - will use the available API (e.g. Win32 or ICU), so may not be consistent on
|
|
// all systems - consider UpperCaseReference() to use our Unicode 10.0 tables
|
|
// - will temporarily decode S into and from UTF-16 so is likely to be slower
|
|
function UpperCaseUnicode(const S: RawUtf8): RawUtf8;
|
|
|
|
/// accurate conversion of the supplied UTF-8 content into the corresponding
|
|
// lower-case Unicode characters
|
|
// - will use the available API (e.g. Win32 or ICU), so may not be consistent on
|
|
// all systems - and also slower than LowerCase/LowerCaseU versions
|
|
function LowerCaseUnicode(const S: RawUtf8): RawUtf8;
|
|
|
|
/// use the RTL to convert the SynUnicode text to UpperCase
|
|
function UpperCaseSynUnicode(const S: SynUnicode): SynUnicode;
|
|
|
|
/// use the RTL to convert the SynUnicode text to LowerCase
|
|
function LowerCaseSynUnicode(const S: SynUnicode): SynUnicode;
|
|
|
|
/// fast WinAnsi comparison using the NormToUpper[] array for all 8-bit values
|
|
function AnsiIComp(Str1, Str2: pointer): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// internal function used when inlining PosExI()
|
|
function PosExIPas(pSub, p: PUtf8Char; Offset: PtrUInt;
|
|
Lookup: PNormTable): PtrInt;
|
|
|
|
/// an ASCII-7 case-insensitive version of PosEx()
|
|
// - will use the NormToUpperAnsi7 lookup table for character conversion
|
|
function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt): PtrInt; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// a case-insensitive version of PosEx() with a specified lookup table
|
|
// - redirect to mormot.core.base PosEx() if Lookup = nil
|
|
function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt;
|
|
Lookup: PNormTable): PtrInt; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
|
|
{ ************ UTF-8 String Manipulation Functions }
|
|
|
|
type
|
|
/// used to store a set of 8-bit encoded characters
|
|
TSynAnsicharSet = set of AnsiChar;
|
|
|
|
/// used to store a set of 8-bit unsigned integers
|
|
TSynByteSet = set of byte;
|
|
|
|
/// a generic callback, which can be used to translate some text on the fly
|
|
// - maps procedure TLanguageFile.Translate(var English: string) signature
|
|
// as defined in mORMoti18n.pas
|
|
// - can be used e.g. for TSynMustache's {{"English text}} callback
|
|
TOnStringTranslate = procedure(var English: string) of object;
|
|
|
|
|
|
/// check case-sensitive matching starting of text in start
|
|
// - returns true if the item matched
|
|
// - see StartWith() in this unit for a case-insensitive version
|
|
function StartWithExact(const text, textStart: RawUtf8): boolean;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// check case-sensitive matching ending of text in ending
|
|
// - returns true if the item matched
|
|
// - see EndWith() in this unit for a case-insensitive version
|
|
function EndWithExact(const text, textEnd: RawUtf8): boolean;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// extract a line from source array of chars
|
|
// - next will contain the beginning of next line, or nil if source has ended
|
|
function GetNextLine(source: PUtf8Char; out next: PUtf8Char;
|
|
andtrim: boolean = false): RawUtf8;
|
|
|
|
/// returns n leading characters
|
|
function LeftU(const S: RawUtf8; n: PtrInt): RawUtf8;
|
|
{$ifdef HASINLINE} inline; {$endif}
|
|
|
|
/// returns n trailing characters
|
|
function RightU(const S: RawUtf8; n: PtrInt): RawUtf8;
|
|
|
|
/// trims leading whitespace characters from the string by removing
|
|
// new line, space, and tab characters
|
|
function TrimLeft(const S: RawUtf8): RawUtf8;
|
|
|
|
/// trims trailing whitespace characters from the string by removing trailing
|
|
// newline, space, and tab characters
|
|
function TrimRight(const S: RawUtf8): RawUtf8;
|
|
|
|
/// trims leading whitespaces of every line of the UTF-8 text
|
|
// - also delete void lines
|
|
// - could be used e.g. before FindNameValue() call
|
|
// - modification is made in-place so S will be modified
|
|
procedure TrimLeftLines(var S: RawUtf8);
|
|
|
|
/// trim some leading and trailing chars
|
|
// - if S is unique (RefCnt=1), will modify the RawUtf8 in place
|
|
// - faster alternative to S := copy(S, Left + 1, length(S) - Left - Right)
|
|
procedure TrimChars(var S: RawUtf8; Left, Right: PtrInt);
|
|
|
|
/// returns the supplied text content, without any specified char
|
|
// - specify a custom char set to be excluded, e.g. as [#0 .. ' ']
|
|
function TrimChar(const text: RawUtf8; const exclude: TSynAnsicharSet): RawUtf8;
|
|
|
|
/// returns the supplied text content, without one specified char
|
|
function TrimOneChar(const text: RawUtf8; exclude: AnsiChar): RawUtf8;
|
|
|
|
/// returns the supplied text content, without any other char than specified
|
|
// - specify a custom char set to be included, e.g. as ['A'..'Z']
|
|
function OnlyChar(const text: RawUtf8; const only: TSynAnsicharSet): RawUtf8;
|
|
|
|
/// returns the supplied text content, without any control char
|
|
// - here control chars have an ASCII code in [#0 .. ' '], i.e. text[] <= ' '
|
|
function TrimControlChars(const text: RawUtf8): RawUtf8;
|
|
|
|
/// split a RawUtf8 string into two strings, according to SepStr separator
|
|
// - returns true and LeftStr/RightStr if they were separated by SepStr
|
|
// - if SepStr is not found, LeftStr=Str and RightStr='' and returns false
|
|
// - if ToUpperCase is TRUE, then LeftStr and RightStr will be made uppercase
|
|
function Split(const Str, SepStr: RawUtf8; var LeftStr, RightStr: RawUtf8;
|
|
ToUpperCase: boolean = false): boolean; overload;
|
|
|
|
/// split a RawUtf8 string into two strings, according to SepStr separator
|
|
// - this overloaded function returns the right string as function result
|
|
// - if SepStr is not found, LeftStr=Str and result=''
|
|
// - if ToUpperCase is TRUE, then LeftStr and result will be made uppercase
|
|
function Split(const Str, SepStr: RawUtf8; var LeftStr: RawUtf8;
|
|
ToUpperCase: boolean = false): RawUtf8; overload;
|
|
|
|
/// split a RawUtf8 string into several strings, according to SepStr separator
|
|
// - this overloaded function will fill a DestPtr[] array of PRawUtf8
|
|
// - if any DestPtr[]=nil, the item will be skipped
|
|
// - if input Str ends before all SepStr[] are found, DestPtr[] is set to ''
|
|
// - returns the number of values extracted into DestPtr[]
|
|
function Split(const Str: RawUtf8; const SepStr: array of RawUtf8;
|
|
const DestPtr: array of PRawUtf8): PtrInt; overload;
|
|
|
|
/// returns the last occurrence of the given SepChar separated context
|
|
// - e.g. SplitRight('01/2/34','/')='34'
|
|
// - if SepChar doesn't appear, will return Str, e.g. SplitRight('123','/')='123'
|
|
// - if LeftStr is supplied, the RawUtf8 it points to will be filled with
|
|
// the left part just before SepChar ('' if SepChar doesn't appear)
|
|
function SplitRight(const Str: RawUtf8; SepChar: AnsiChar; LeftStr: PRawUtf8 = nil): RawUtf8;
|
|
|
|
/// returns the last occurrence of the given SepChar separated context
|
|
// - e.g. SplitRight('path/one\two/file.ext','/\')='file.ext', i.e.
|
|
// SepChars='/\' will be like ExtractFileName() over RawUtf8 string
|
|
// - if SepChar doesn't appear, will return Str, e.g. SplitRight('123','/')='123'
|
|
function SplitRights(const Str, SepChar: RawUtf8): RawUtf8;
|
|
|
|
/// check if all characters within text are spaces or control chars
|
|
// - i.e. a faster alternative to if TrimU(text)='' then
|
|
function IsVoid(const text: RawUtf8): boolean;
|
|
|
|
/// fill all bytes of this memory buffer with zeros, i.e. 'toto' -> #0#0#0#0
|
|
// - will write the memory buffer directly, if this string instance is not shared
|
|
// (i.e. has refcount = 1), to avoid zeroing still-used values
|
|
// - may be used to cleanup stack-allocated content
|
|
// ! ... finally FillZero(secret); end;
|
|
procedure FillZero(var secret: RawByteString); overload;
|
|
|
|
/// fill all bytes of this UTF-8 string with zeros, i.e. 'toto' -> #0#0#0#0
|
|
// - will write the memory buffer directly, if this string instance is not shared
|
|
// (i.e. has refcount = 1), to avoid zeroing still-used values
|
|
// - may be used to cleanup stack-allocated content
|
|
// ! ... finally FillZero(secret); end;
|
|
procedure FillZero(var secret: RawUtf8); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// fill all bytes of this UTF-8 string with zeros, i.e. 'toto' -> #0#0#0#0
|
|
// - SpiUtf8 type has been defined explicitly to store Sensitive Personal
|
|
// Information
|
|
procedure FillZero(var secret: SpiUtf8); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// fill all bytes of this dynamic array of bytes with zeros
|
|
// - will write the memory buffer directly, if this array instance is not shared
|
|
// (i.e. has refcount = 1), to avoid zeroing still-used values
|
|
procedure FillZero(var secret: TBytes); overload;
|
|
|
|
{$ifdef HASVARUSTRING}
|
|
/// fill all bytes of this UTF-16 string with zeros, i.e. 'toto' -> #0#0#0#0
|
|
procedure FillZero(var secret: UnicodeString); overload;
|
|
{$endif HASVARUSTRING}
|
|
|
|
/// actual replacement function called by StringReplaceAll() on first match
|
|
// - not to be called as such, but defined globally for proper inlining
|
|
function StringReplaceAllProcess(const S, OldPattern, NewPattern: RawUtf8;
|
|
found: integer; Lookup: PNormTable): RawUtf8;
|
|
|
|
/// fast version of StringReplace(S, OldPattern, NewPattern, [rfReplaceAll]);
|
|
function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8;
|
|
Lookup: PNormTable = nil): RawUtf8; overload;
|
|
|
|
/// case-sensitive (or not) StringReplace(S, OldPattern, NewPattern,[rfReplaceAll])
|
|
// - calls plain StringReplaceAll() version for CaseInsensitive = false
|
|
// - calls StringReplaceAll(.., NormToUpperAnsi7) if CaseInsensitive = true
|
|
function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8;
|
|
CaseInsensitive: boolean): RawUtf8; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// fast version of several cascaded StringReplaceAll()
|
|
function StringReplaceAll(const S: RawUtf8;
|
|
const OldNewPatternPairs: array of RawUtf8;
|
|
CaseInsensitive: boolean = false): RawUtf8; overload;
|
|
|
|
/// fast replace of a specified char by another char
|
|
function StringReplaceChars(const Source: RawUtf8; OldChar, NewChar: AnsiChar): RawUtf8;
|
|
|
|
/// fast replace of all #9 chars by a given string
|
|
function StringReplaceTabs(const Source, TabText: RawUtf8): RawUtf8;
|
|
|
|
/// UTF-8 dedicated (and faster) alternative to StringOfChar(Ch, Count)
|
|
function RawUtf8OfChar(Ch: AnsiChar; Count: integer): RawUtf8;
|
|
|
|
/// format a text content with SQL-like quotes
|
|
// - this function implements what is specified in the official SQLite3
|
|
// documentation: "A string constant is formed by enclosing the string in single
|
|
// quotes ('). A single quote within the string can be encoded by putting two
|
|
// single quotes in a row - as in Pascal."
|
|
function QuotedStr(const S: RawUtf8; Quote: AnsiChar = ''''): RawUtf8; overload;
|
|
|
|
/// format a text content with SQL-like quotes
|
|
procedure QuotedStr(const S: RawUtf8; Quote: AnsiChar; var result: RawUtf8); overload;
|
|
|
|
/// format a text buffer with SQL-like quotes
|
|
procedure QuotedStr(P: PUtf8Char; PLen: PtrInt; Quote: AnsiChar;
|
|
var result: RawUtf8); overload;
|
|
|
|
/// unquote a SQL-compatible string
|
|
// - the first character in P^ must be either ' or " then internal double quotes
|
|
// are transformed into single quotes
|
|
// - 'text '' end' -> text ' end
|
|
// - "text "" end" -> text " end
|
|
// - returns nil if P doesn't contain a valid SQL string
|
|
// - returns a pointer just after the quoted text otherwise
|
|
function UnQuoteSqlStringVar(P: PUtf8Char; out Value: RawUtf8): PUtf8Char;
|
|
|
|
/// unquote a SQL-compatible string
|
|
function UnQuoteSqlString(const Value: RawUtf8): RawUtf8;
|
|
|
|
/// unquote a SQL-compatible symbol name
|
|
// - e.g. '[symbol]' -> 'symbol' or '"symbol"' -> 'symbol'
|
|
function UnQuotedSqlSymbolName(const ExternalDBSymbol: RawUtf8): RawUtf8;
|
|
|
|
/// get the next character after a quoted buffer
|
|
// - the first character in P^ must be either ' or "
|
|
// - it will return the latest quote position, ignoring double quotes within
|
|
function GotoEndOfQuotedString(P: PUtf8Char): PUtf8Char;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// get the next character not in [#1..' ']
|
|
function GotoNextNotSpace(P: PUtf8Char): PUtf8Char;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// get the next character not in [#9,' ']
|
|
function GotoNextNotSpaceSameLine(P: PUtf8Char): PUtf8Char;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// get the next character in [#0..' ']
|
|
function GotoNextSpace(P: PUtf8Char): PUtf8Char;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// check if the next character not in [#1..' '] matches a given value
|
|
// - first ignore any space character (i.e. any char in [#1..' '])
|
|
// - then returns TRUE if P^=ch, setting P to the character after ch
|
|
// - or returns FALSE if P^<>ch, leaving P at the level of the unexpected char
|
|
function NextNotSpaceCharIs(var P: PUtf8Char; ch: AnsiChar): boolean;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// retrieve the next SQL-like identifier within the UTF-8 buffer
|
|
// - will also trim any space (or line feeds) and trailing ';'
|
|
// - any comment like '/*nocache*/' will be ignored
|
|
// - returns true if something was set to Prop
|
|
function GetNextFieldProp(var P: PUtf8Char; var Prop: RawUtf8): boolean;
|
|
|
|
/// retrieve the next identifier within the UTF-8 buffer on the same line
|
|
// - GetNextFieldProp() will just handle line feeds (and ';') as spaces - which
|
|
// is fine e.g. for SQL, but not for regular config files with name/value pairs
|
|
// - returns true if something was set to Prop
|
|
function GetNextFieldPropSameLine(var P: PUtf8Char; var Prop: ShortString): boolean;
|
|
|
|
/// return true if IdemPChar(source,searchUp), and go to the next line of source
|
|
function IdemPCharAndGetNextLine(var source: PUtf8Char; searchUp: PAnsiChar): boolean;
|
|
|
|
/// search for a value from its uppercased named entry
|
|
// - i.e. iterate IdemPChar(source,UpperName) over every line of the source
|
|
// - returns the text just after UpperName if it has been found at line beginning
|
|
// - returns nil if UpperName was not found at any line beginning
|
|
// - could be used e.g. to efficiently extract a value from HTTP headers, whereas
|
|
// FindIniNameValue() is tuned for [section]-oriented INI files
|
|
function FindNameValue(P: PUtf8Char; UpperName: PAnsiChar): PUtf8Char; overload;
|
|
|
|
/// search and returns a value from its uppercased named entry
|
|
// - i.e. iterate IdemPChar(source,UpperName) over every line of the source
|
|
// - returns true and the trimmed text just after UpperName into Value
|
|
// if it has been found at line beginning
|
|
// - returns false and set Value := '' if UpperName was not found (or leave
|
|
// Value untouched if KeepNotFoundValue is true)
|
|
// - could be used e.g. to efficiently extract a value from HTTP headers, whereas
|
|
// FindIniNameValue() is tuned for [section]-oriented INI files
|
|
// - do TrimLeftLines(NameValuePairs) first if the lines start with spaces/tabs
|
|
function FindNameValue(const NameValuePairs: RawUtf8; UpperName: PAnsiChar;
|
|
var Value: RawUtf8; KeepNotFoundValue: boolean = false;
|
|
UpperNameSeparator: AnsiChar = #0): boolean; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// search and returns a PUtf8Char value from its uppercased named entry
|
|
// - as called when inlining FindNameValue()
|
|
// - won't make any memory allocation, so could be fine for a quick lookup
|
|
function FindNameValuePointer(NameValuePairs: PUtf8Char; UpperName: PAnsiChar;
|
|
out FoundLen: PtrInt; UpperNameSeparator: AnsiChar): PUtf8Char;
|
|
|
|
/// compute the line length from source array of chars
|
|
// - if PEnd = nil, end counting at either #0, #13 or #10
|
|
// - otherwise, end counting at either #13 or #10
|
|
// - just a wrapper around BufferLineLength() checking PEnd=nil case
|
|
function GetLineSize(P, PEnd: PUtf8Char): PtrUInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// returns true if the line length from source array of chars is not less than
|
|
// the specified count
|
|
function GetLineSizeSmallerThan(P, PEnd: PUtf8Char; aMinimalCount: integer): boolean;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
/// return next string delimited with #13#10 from P, nil if no more
|
|
// - this function returns a RawUnicode string type
|
|
function GetNextStringLineToRawUnicode(var P: PChar): RawUnicode;
|
|
{$endif PUREMORMOT2}
|
|
|
|
/// trim first lowercase chars ('otDone' will return 'Done' e.g.)
|
|
// - return a PUtf8Char to avoid any memory allocation
|
|
function TrimLeftLowerCase(const V: RawUtf8): PUtf8Char;
|
|
|
|
/// trim first lowercase chars ('otDone' will return 'Done' e.g.)
|
|
// - return a RawUtf8 string: enumeration names are pure 7-bit ANSI with Delphi 7
|
|
// to 2007, and UTF-8 encoded with Delphi 2009+
|
|
function TrimLeftLowerCaseShort(V: PShortString): RawUtf8;
|
|
|
|
/// trim first lowercase chars ('otDone' will return 'Done' e.g.)
|
|
// - return a ShortString: enumeration names are pure 7-bit ANSI with Delphi 7
|
|
// to 2007, and UTF-8 encoded with Delphi 2009+
|
|
function TrimLeftLowerCaseToShort(V: PShortString): ShortString; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// trim first lowercase chars ('otDone' will return 'Done' e.g.)
|
|
// - return a ShortString: enumeration names are pure 7-bit ANSI with Delphi 7
|
|
// to 2007, and UTF-8 encoded with Delphi 2009+
|
|
procedure TrimLeftLowerCaseToShort(V: PShortString; out result: ShortString); overload;
|
|
|
|
/// fast append some UTF-8 text into a ShortString, with an ending ','
|
|
procedure AppendShortComma(text: PAnsiChar; len: PtrInt; var result: ShortString;
|
|
trimlowercase: boolean); {$ifdef FPC} inline; {$endif}
|
|
|
|
/// fast search of an exact case-insensitive match of a RTTI's PShortString array
|
|
function FindShortStringListExact(List: PShortString; MaxValue: integer;
|
|
aValue: PUtf8Char; aValueLen: PtrInt): integer;
|
|
|
|
/// fast case-insensitive search of a left-trimmed lowercase match
|
|
// of a RTTI's PShortString array
|
|
function FindShortStringListTrimLowerCase(List: PShortString; MaxValue: integer;
|
|
aValue: PUtf8Char; aValueLen: PtrInt): integer;
|
|
|
|
/// fast case-sensitive search of a left-trimmed lowercase match
|
|
// of a RTTI's PShortString array
|
|
function FindShortStringListTrimLowerCaseExact(List: PShortString; MaxValue: integer;
|
|
aValue: PUtf8Char; aValueLen: PtrInt): integer;
|
|
|
|
/// convert a CamelCase string into a space separated one
|
|
// - 'OnLine' will return 'On line' e.g., and 'OnMyLINE' will return 'On my LINE'
|
|
// - will handle capital words at the beginning, middle or end of the text, e.g.
|
|
// 'KLMFlightNumber' will return 'KLM flight number' and 'GoodBBCProgram' will
|
|
// return 'Good BBC program'
|
|
// - will handle a number at the beginning, middle or end of the text, e.g.
|
|
// 'Email12' will return 'Email 12'
|
|
// - '_' char is transformed into ' - '
|
|
// - '__' chars are transformed into ': '
|
|
// - return a RawUtf8 string: enumeration names are pure 7-bit ANSI with Delphi
|
|
// up to 2007, and UTF-8 encoded with Delphi 2009+
|
|
function UnCamelCase(const S: RawUtf8): RawUtf8; overload;
|
|
|
|
/// convert a CamelCase string into a space separated one
|
|
// - 'OnLine' will return 'On line' e.g., and 'OnMyLINE' will return 'On my LINE'
|
|
// - will handle capital words at the beginning, middle or end of the text, e.g.
|
|
// 'KLMFlightNumber' will return 'KLM flight number' and 'GoodBBCProgram' will
|
|
// return 'Good BBC program'
|
|
// - will handle a number at the beginning, middle or end of the text, e.g.
|
|
// 'Email12' will return 'Email 12'
|
|
// - return the char count written into D^
|
|
// - D^ and P^ are expected to be UTF-8 encoded: enumeration and property names
|
|
// are pure 7-bit ANSI with Delphi 7 to 2007, and UTF-8 encoded with Delphi 2009+
|
|
// - '_' char is transformed into ' - '
|
|
// - '__' chars are transformed into ': '
|
|
function UnCamelCase(D, P: PUtf8Char): integer; overload;
|
|
|
|
/// convert a string into a human-friendly CamelCase identifier
|
|
// - replacing spaces or punctuations by an uppercase character
|
|
// - as such, it is not the reverse function to UnCamelCase()
|
|
procedure CamelCase(P: PAnsiChar; len: PtrInt; var s: RawUtf8;
|
|
const isWord: TSynByteSet = [ord('0')..ord('9'), ord('a')..ord('z'), ord('A')..ord('Z')]); overload;
|
|
|
|
/// convert a string into a human-friendly CamelCase identifier
|
|
// - replacing spaces or punctuations by an uppercase character
|
|
// - as such, it is not the reverse function to UnCamelCase()
|
|
procedure CamelCase(const text: RawUtf8; var s: RawUtf8;
|
|
const isWord: TSynByteSet = [ord('0')..ord('9'), ord('a')..ord('z'), ord('A')..ord('Z')]); overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
var
|
|
/// these procedure type must be defined if a default system.pas is used
|
|
// - expect generic "string" type, i.e. UnicodeString for Delphi 2009+
|
|
LoadResStringTranslate: procedure(var Text: string) = nil;
|
|
|
|
/// UnCamelCase and translate a char buffer
|
|
// - P is expected to be #0 ended
|
|
// - return "string" type, i.e. UnicodeString for Delphi 2009+
|
|
procedure GetCaptionFromPCharLen(P: PUtf8Char; out result: string);
|
|
|
|
|
|
{ ************ TRawUtf8DynArray Processing Functions }
|
|
|
|
/// returns TRUE if Values is nil or all supplied Values[] equal ''
|
|
function IsZero(const Values: TRawUtf8DynArray): boolean; overload;
|
|
|
|
/// quick helper to initialize a dynamic array of RawUtf8 from some constants
|
|
// - can be used e.g. as:
|
|
// ! MyArray := TRawUtf8DynArrayFrom(['a','b','c']);
|
|
function TRawUtf8DynArrayFrom(const Values: array of RawUtf8): TRawUtf8DynArray;
|
|
|
|
/// low-level efficient search of Value in Values[]
|
|
// - CaseSensitive=false will use StrICmp() for A..Z / a..z equivalence
|
|
function FindRawUtf8(Values: PRawUtf8; const Value: RawUtf8; ValuesCount: integer;
|
|
CaseSensitive: boolean): integer; overload;
|
|
|
|
/// return the index of Value in Values[], -1 if not found
|
|
// - CaseSensitive=false will use StrICmp() for A..Z / a..z equivalence
|
|
function FindRawUtf8(const Values: TRawUtf8DynArray; const Value: RawUtf8;
|
|
CaseSensitive: boolean = true): integer; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// return the index of Value in Values[], -1 if not found
|
|
// - CaseSensitive=false will use StrICmp() for A..Z / a..z equivalence
|
|
function FindRawUtf8(const Values: array of RawUtf8; const Value: RawUtf8;
|
|
CaseSensitive: boolean = true): integer; overload;
|
|
|
|
/// true if Value was added successfully in Values[]
|
|
function AddRawUtf8(var Values: TRawUtf8DynArray; const Value: RawUtf8;
|
|
NoDuplicates: boolean = false; CaseSensitive: boolean = true): boolean; overload;
|
|
|
|
/// add the Value to Values[], with an external count variable, for performance
|
|
function AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
|
|
const Value: RawUtf8): PtrInt; overload;
|
|
|
|
/// add Value[] items to Values[]
|
|
procedure AddRawUtf8(var Values: TRawUtf8DynArray; const Value: TRawUtf8DynArray); overload;
|
|
|
|
/// add Value[] items to Values[], with an external count variable, for performance
|
|
procedure AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
|
|
const Value: TRawUtf8DynArray); overload;
|
|
|
|
/// true if both TRawUtf8DynArray are the same
|
|
// - comparison is case-sensitive
|
|
function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray): boolean; overload;
|
|
|
|
/// true if both TRawUtf8DynArray are the same for a given number of items
|
|
// - A and B are expected to have at least Count items
|
|
// - comparison is case-sensitive
|
|
function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray;
|
|
Count: integer): boolean; overload;
|
|
|
|
/// add the Value to Values[] string array
|
|
function AddString(var Values: TStringDynArray; const Value: string): PtrInt;
|
|
|
|
/// convert the string dynamic array into a dynamic array of UTF-8 strings
|
|
procedure StringDynArrayToRawUtf8DynArray(const Source: TStringDynArray;
|
|
var result: TRawUtf8DynArray);
|
|
|
|
/// convert the string list into a dynamic array of UTF-8 strings
|
|
procedure StringListToRawUtf8DynArray(Source: TStringList;
|
|
var result: TRawUtf8DynArray);
|
|
|
|
/// retrieve the index where to insert a PUtf8Char in a sorted PUtf8Char array
|
|
// - R is the last index of available entries in P^ (i.e. Count-1)
|
|
// - string comparison is case-sensitive StrComp (so will work with any PAnsiChar)
|
|
// - returns -1 if the specified Value was found (i.e. adding will duplicate a value)
|
|
// - will use fast O(log(n)) binary search algorithm
|
|
function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
|
|
Value: PUtf8Char): PtrInt; overload;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// retrieve the index where to insert a PUtf8Char in a sorted PUtf8Char array
|
|
// - this overloaded function accept a custom comparison function for sorting
|
|
// - R is the last index of available entries in P^ (i.e. Count-1)
|
|
// - string comparison is case-sensitive (so will work with any PAnsiChar)
|
|
// - returns -1 if the specified Value was found (i.e. adding will duplicate a value)
|
|
// - will use fast O(log(n)) binary search algorithm
|
|
function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
|
|
Value: PUtf8Char; Compare: TUtf8Compare): PtrInt; overload;
|
|
|
|
/// retrieve the index where is located a PUtf8Char in a sorted PUtf8Char array
|
|
// - R is the last index of available entries in P^ (i.e. Count-1)
|
|
// - string comparison is case-sensitive StrComp (so will work with any PAnsiChar)
|
|
// - returns -1 if the specified Value was not found
|
|
// - will use inlined binary search algorithm with optimized x86_64 branchless asm
|
|
// - slightly faster than plain FastFindPUtf8CharSorted(P,R,Value,@StrComp)
|
|
function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
|
|
Value: PUtf8Char): PtrInt; overload;
|
|
|
|
/// retrieve the index where is located a PUtf8Char in a sorted uppercase array
|
|
// - P[] array is expected to be already uppercased
|
|
// - searched Value is converted to uppercase before search via UpperCopy255Buf(),
|
|
// so is expected to be short, i.e. length < 250
|
|
// - R is the last index of available entries in P^ (i.e. Count-1)
|
|
// - returns -1 if the specified Value was not found
|
|
// - will use fast O(log(n)) binary search algorithm
|
|
// - slightly faster than plain FastFindPUtf8CharSorted(P,R,Value,@StrIComp)
|
|
function FastFindUpperPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
|
|
Value: PUtf8Char; ValueLen: PtrInt): PtrInt;
|
|
{$ifdef HASINLINE}inline;{$endif}
|
|
|
|
/// retrieve the index where is located a PUtf8Char in a sorted PUtf8Char array
|
|
// - R is the last index of available entries in P^ (i.e. Count-1)
|
|
// - string comparison will use the specified Compare function
|
|
// - returns -1 if the specified Value was not found
|
|
// - will use fast O(log(n)) binary search algorithm
|
|
function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
|
|
Value: PUtf8Char; Compare: TUtf8Compare): PtrInt; overload;
|
|
|
|
/// retrieve the index of a PUtf8Char in a PUtf8Char array via its sorted indexes
|
|
// - will use fast O(log(n)) binary search algorithm
|
|
function FastFindIndexedPUtf8Char(P: PPUtf8CharArray; R: PtrInt;
|
|
var SortedIndexes: TCardinalDynArray; Value: PUtf8Char;
|
|
ItemComp: TUtf8Compare): PtrInt;
|
|
|
|
/// add a RawUtf8 value in an alphabetically sorted dynamic array of RawUtf8
|
|
// - returns the index where the Value was added successfully in Values[]
|
|
// - returns -1 if the specified Value was already present in Values[]
|
|
// (we must avoid any duplicate for O(log(n)) binary search)
|
|
// - if CoValues is set, its content will be moved to allow inserting a new
|
|
// value at CoValues[result] position - a typical usage of CoValues is to store
|
|
// the corresponding ID to each RawUtf8 item
|
|
// - if FastLocatePUtf8CharSorted() has been already called, this index can
|
|
// be set to the optional ForcedIndex parameter
|
|
// - by default, exact (case-sensitive) match is used; you can specify a custom
|
|
// compare function if needed in Compare optional parameter
|
|
function AddSortedRawUtf8(var Values: TRawUtf8DynArray;
|
|
var ValuesCount: integer; const Value: RawUtf8;
|
|
CoValues: PIntegerDynArray = nil; ForcedIndex: PtrInt = -1;
|
|
Compare: TUtf8Compare = nil): PtrInt;
|
|
|
|
/// delete a RawUtf8 item in a dynamic array of RawUtf8
|
|
// - if CoValues is set, the integer item at the same index is also deleted
|
|
function DeleteRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
|
|
Index: integer; CoValues: PIntegerDynArray = nil): boolean; overload;
|
|
|
|
/// delete a RawUtf8 item in a dynamic array of RawUtf8
|
|
function DeleteRawUtf8(var Values: TRawUtf8DynArray;
|
|
Index: PtrInt): boolean; overload;
|
|
|
|
/// sort a dynamic array of RawUtf8 items
|
|
// - if CoValues is set, the integer items are also synchronized
|
|
// - by default, exact (case-sensitive) match is used; you can specify a custom
|
|
// compare function if needed in Compare optional parameter
|
|
procedure QuickSortRawUtf8(var Values: TRawUtf8DynArray; ValuesCount: integer;
|
|
CoValues: PIntegerDynArray = nil; Compare: TUtf8Compare = nil); overload;
|
|
|
|
/// sort a RawUtf8 array, low values first
|
|
procedure QuickSortRawUtf8(Values: PRawUtf8Array; L, R: PtrInt;
|
|
caseInsensitive: boolean = false); overload;
|
|
|
|
{$ifdef OSPOSIX}
|
|
type
|
|
/// monitor a POSIX folder for all its file names, and allow efficient
|
|
// case-insensitive search, as it would on a Windows file system
|
|
// - will use our fast PosixFileNames() low-level API to read the names
|
|
// and store them into its in-memory cache (until Flush or after FlushSeconds)
|
|
TPosixFileCaseInsensitive = class
  protected
    // lock declared together with the cached list below
    fSafe: TRWLightLock;
    // cached file names of the monitored folder
    fFiles: TRawUtf8DynArray;
    fFolder: TFileName;
    fNextTix: integer;
    fFlushSeconds: integer;
    fSubFolders: boolean;
    // property setters
    procedure SetFolder(const aFolder: TFileName);
    procedure SetSubFolders(aSubFolders: boolean);
  public
    /// set up the file names lookup for a given folder
    constructor Create(const aFolder: TFileName; aSubFolders: boolean); reintroduce;
    /// should be called at a regular pace (e.g. every second) so that the
    // FlushSeconds timeout could be applied
    procedure OnIdle(tix64: Int64);
    /// empty the internal list, forcing a full reload of the directory
    procedure Flush;
    /// look for a given TFileName in the folder, ignoring the character case
    // - returns the exact file name as stored in the POSIX folder, or '' if
    // there is no match
    // - is thread-safe and non blocking during its lookup
    // - can optionally return MicroSec spent for actual filenames read on disk
    function Find(const aSearched: TFileName; aReadMs: PInteger = nil): TFileName;
    /// number of file entries currently stored in the internal list
    function Count: PtrInt;
    /// return a dynamic array copy of the internal file names, sorted by StrIComp
    function Files: TRawUtf8DynArray;
    /// the monitored folder, which can be changed at runtime
    property Folder: TFileName read fFolder write SetFolder;
    /// whether sub-folders should also be included to the internal list
    property SubFolders: boolean read fSubFolders write SetSubFolders;
    /// number of seconds after which OnIdle() should flush the internal cache
    // - default is 60, i.e. 1 minute
    // - set 0 to disable any auto-flush from OnIdle()
    property FlushSeconds: integer read fFlushSeconds write fFlushSeconds;
  end;
|
|
{$endif OSPOSIX}
|
|
|
|
|
|
{ ************** Operating-System Independent Unicode Process }
|
|
|
|
/// UpperCase conversion of a UTF-8 buffer using our Unicode 10.0 tables
|
|
// - won't call the Operating System, so is consistent on all platforms,
|
|
// whereas UpperCaseUnicode() may vary depending on each library implementation
|
|
// - some codepoints grow in length, so the D^ buffer should be at least twice as large as S^
|
|
// - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd
|
|
// - won't use temporary UTF-16 decoding, and optimized for plain ASCII content
|
|
function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char; overload;
|
|
|
|
/// UpperCase conversion of a UTF-8 buffer using our Unicode 10.0 tables
|
|
// - won't call the Operating System, so is consistent on all platforms,
|
|
// whereas UpperCaseUnicode() may vary depending on each library implementation
|
|
// - some codepoints grow in length, so the D^ buffer should be at least twice as large as S^
|
|
// - any invalid input is replaced by UNICODE_REPLACEMENT_CHARACTER=$fffd
|
|
// - won't use temporary UTF-16 decoding, and optimized for plain ASCII content
|
|
// - knowing the Source length, this function will handle any ASCII 7-bit input
|
|
// by quad, for efficiency
|
|
function Utf8UpperReference(S, D: PUtf8Char; SLen: PtrUInt): PUtf8Char; overload;
|
|
|
|
/// UpperCase conversion of a UTF-8 string using our Unicode 10.0 tables
|
|
// - won't call the Operating System, so is consistent on all platforms,
|
|
// whereas UpperCaseUnicode() may vary depending on each library implementation
|
|
// - won't use temporary UTF-16 decoding, and optimized for plain ASCII content
|
|
function UpperCaseReference(const S: RawUtf8): RawUtf8;
|
|
|
|
/// UTF-8 comparison using our Unicode 10.0 tables
|
|
// - this version expects u1 and u2 to be zero-terminated
|
|
// - Utf8IComp() handles WinAnsi CP-1252 latin accents - this one is Unicode
|
|
// - won't call the Operating System, so is consistent on all platforms, and
|
|
// don't require any temporary UTF-16 decoding
|
|
// - has a branchless optimized process of 7-bit ASCII charset [a..z] -> [A..Z]
|
|
function Utf8ICompReference(u1, u2: PUtf8Char): PtrInt;
|
|
|
|
/// UTF-8 comparison using our Unicode 10.0 tables
|
|
// - this version does not require u1 and u2 to be zero-terminated, but
|
|
// uses L1 and L2 as length for u1 and u2 respectively
|
|
// - Utf8ILComp() handles WinAnsi CP-1252 latin accents - this one is Unicode
|
|
// - won't call the Operating System, so is consistent on all platforms, and
|
|
// don't require any temporary UTF-16 decoding
|
|
// - has a branchless optimized process of 7-bit ASCII charset [a..z] -> [A..Z]
|
|
function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt;
|
|
|
|
/// UpperCase conversion of UTF-8 into UCS4 using our Unicode 10.0 tables
|
|
// - won't call the Operating System, so is consistent on all platforms,
|
|
// whereas UpperCaseUnicode() may vary depending on each library implementation
|
|
function UpperCaseUcs4Reference(const S: RawUtf8): RawUcs4;
|
|
|
|
/// UTF-8 Unicode 10.0 case-insensitive Pattern search within UTF-8 buffer
|
|
// - returns nil if no match, or the Pattern position found inside U^
|
|
// - Up should have been already converted using UpperCaseUcs4Reference()
|
|
// - won't call the Operating System, so is consistent on all platforms, and
|
|
// don't require any temporary UTF-16 decoding
|
|
function StrPosIReference(U: PUtf8Char; const Up: RawUcs4): PUtf8Char;
|
|
|
|
|
|
implementation
|
|
|
|
|
|
{ *************** UTF-8 Efficient Encoding / Decoding }
|
|
|
|
{ TUtf8Table }
|
|
|
|
function TUtf8Table.GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint;
var
  trail, remaining: PtrInt;
  tail: PUtf8Char;
  cont: byte;
  acc: PtrUInt;
begin
  // decode one multi-byte UTF-8 sequence starting at U^ into its UCS4 value
  // - the caller is expected to supply U^ >= #$80
  // - returns 0 on any malformed input: U is then left just after the lead byte
  // - on success, U is moved after the last byte of the sequence
  result := 0;
  acc := byte(U^);
  inc(U);
  // Lookup[] maps the lead byte to the number of following bytes to consume
  trail := Lookup[acc];
  if trail = UTF8_INVALID then
    exit;
  // accumulate 6 bits per following byte, walking a local pointer
  tail := U;
  remaining := trail;
  repeat
    cont := byte(tail^);
    if cont and $c0 <> $80 then
      exit; // not a 10xxxxxx continuation byte: invalid input content
    acc := (acc shl 6) + cont;
    inc(tail);
    dec(remaining);
  until remaining = 0;
  U := tail;
  // subtract the per-length offset, then reject values below the per-length
  // minimum (presumably over-long encodings - see the Extra[] table content)
  dec(acc, Extra[trail].offset);
  if acc >= Extra[trail].minimum then
    result := acc;
end;
|
|
|
|
function GetHighUtf8Ucs4(var U: PUtf8Char): Ucs4CodePoint;
begin
  // stand-alone function form of the TUtf8Table method, always working on
  // the shared UTF8_TABLE constant: U is updated by the method itself
  result := UTF8_TABLE.GetHighUtf8Ucs4(U);
end;
|
|
|
|
function GetUtf8WideChar(P: PUtf8Char): cardinal;
begin
  // return the first UTF-8 encoded code point of P as a single UTF-16 value
  // - P is a by-value parameter, so the caller's pointer is never moved
  if P = nil then
  begin
    result := 0; // no input buffer
    exit;
  end;
  result := byte(P^);
  if result <= $7f then
    exit; // plain 7-bit ASCII char is returned directly
  // multi-byte sequence: decode it through the shared lookup table
  result := UTF8_TABLE.GetHighUtf8Ucs4(P);
  if result > $ffff then
    // a value above $ffff would need a surrogate pair, which can't be stored
    // in a single UTF-16 WideChar
    result := UNICODE_REPLACEMENT_CHARACTER;
end;
|
|
|
|
function NextUtf8Ucs4(var P: PUtf8Char): Ucs4CodePoint;
var
  lead: Ucs4CodePoint;
begin
  // decode the UTF-8 code point at P^ and move P after it
  // - returns 0 with P untouched if P = nil
  result := 0;
  if P = nil then
    exit;
  lead := byte(P[0]);
  if lead <= 127 then
  begin
    // single 7-bit ASCII byte (note that P is advanced even for #0)
    inc(P);
    result := lead;
  end
  else if lead and $20 <> 0 then
    // longer sequence: delegate to the table-driven decoder, which takes care
    // of moving P and returns 0 on invalid content
    result := UTF8_TABLE.GetHighUtf8Ucs4(P)
  else
  begin
    // fast $0..$7ff process of a two-byte sequence
    // - the second byte is not validated on this code path
    // - $3080 = ($C0 shl 6) + $80 removes both UTF-8 marker bits at once
    result := (lead shl 6) + byte(P[1]) - $3080;
    inc(P, 2);
  end;
end;
|
|
|
|
function Ucs4ToUtf8(ucs4: Ucs4CodePoint; Dest: PUtf8Char): PtrInt;
var
  ndx: PtrInt;
begin
  // encode one UCS4 code point as UTF-8 into Dest^
  // - returns the number of bytes written (1..6)
  // - no trailing #0 is appended, and Dest is expected to be big enough
  if ucs4 > $ffff then
  begin
    // 4 to 6 bytes are needed: generic loop
    if ucs4 > $3FFFFFF then
      result := 6
    else if ucs4 > $1FFFFF then
      result := 5
    else
      result := 4;
    // write the continuation bytes backward, 6 bits at a time
    for ndx := result - 1 downto 1 do
    begin
      Dest[ndx] := AnsiChar((ucs4 and $3f) or $80);
      ucs4 := ucs4 shr 6;
    end;
    // remaining high bits are merged with the length-specific lead marker
    Dest^ := AnsiChar(byte(ucs4) or UTF8_TABLE.FirstByte[result]);
  end
  else if ucs4 > $7ff then
  begin
    // $800..$ffff: 1110xxxx 10xxxxxx 10xxxxxx
    Dest[0] := AnsiChar($E0 or (ucs4 shr 12));
    Dest[1] := AnsiChar($80 or ((ucs4 shr 6) and $3F));
    Dest[2] := AnsiChar($80 or (ucs4 and $3F));
    result := 3;
  end
  else if ucs4 > $7f then
  begin
    // $80..$7ff: 110xxxxx 10xxxxxx
    Dest[0] := AnsiChar($C0 or (ucs4 shr 6));
    Dest[1] := AnsiChar($80 or (ucs4 and $3F));
    result := 2;
  end
  else
  begin
    // $00..$7f: stored as a single byte
    Dest^ := AnsiChar(ucs4);
    result := 1;
  end;
end;
|
|
|
|
function Utf16CharToUtf8(Dest: PUtf8Char; var Source: PWord): integer;
var
  first: cardinal;
begin
  // convert the UTF-16 code point at Source^ into UTF-8 within Dest^
  // - returns the number of bytes written to Dest
  // - Source is moved by one word, or by two words if the first word is in
  // one of the surrogate ranges
  // - the second word of a pair is not checked to be a matching surrogate,
  // and no buffer end is verified
  first := Source^;
  inc(Source);
  if first <= $7f then
  begin
    // 7-bit ASCII is stored directly
    Dest^ := AnsiChar(first);
    result := 1;
    exit;
  end;
  if (first >= UTF16_HISURROGATE_MIN) and
     (first <= UTF16_HISURROGATE_MAX) then
  begin
    // high surrogate first: combine it with the following word
    first := ((first - UTF16_SURROGATE_OFFSET) shl 10) or
             (Source^ xor UTF16_LOSURROGATE_MIN);
    inc(Source);
  end
  else if (first >= UTF16_LOSURROGATE_MIN) and
          (first <= UTF16_LOSURROGATE_MAX) then
  begin
    // low surrogate first: the following word is used as the high part
    first := ((cardinal(Source^) - UTF16_SURROGATE_OFFSET) shl 10) or
             (first xor UTF16_LOSURROGATE_MIN);
    inc(Source);
  end;
  // now first is the UTF-32/UCS4 code point
  result := Ucs4ToUtf8(first, Dest);
end;
|
|
|
|
// Convert a UTF-16 buffer into UTF-8, with output buffer length checking
// - Dest/DestLen is the output buffer and its size in bytes
// - Source/SourceLen is the input buffer and its size in WideChars
// - returns the number of bytes written to Dest, trailing #0 excluded
// - a #0 is appended at Dest unless ccfNoTrailingZero is in Flags: this byte
// is stored without being checked against DestLen
// - an unmatched surrogate is stored as U+FFFD if
// ccfReplacementCharacterForUnmatchedSurrogate is in Flags, otherwise the
// conversion stops at this position
function RawUnicodeToUtf8(Dest: PUtf8Char; DestLen: PtrInt; Source: PWideChar;
  SourceLen: PtrInt; Flags: TCharConversionFlags): PtrInt;
var
  c: cardinal;
  Tail: PWideChar;
  i, j: integer;
label
  unmatch;
begin
  result := PtrInt(Dest);
  inc(DestLen, PtrInt(Dest)); // DestLen is now the end address of Dest
  if (Source <> nil) and
     (SourceLen > 0) and
     (Dest <> nil) then
  begin
    // ignore any leading BOM (do exist on Windows files)
    if Source^ = BOM_UTF16LE then
    begin
      inc(Source);
      dec(SourceLen);
    end;
    // first handle 7-bit ASCII WideChars, by pairs (Sha optimization)
    SourceLen := SourceLen * 2 + PtrInt(PtrUInt(Source)); // now the end address
    Tail := PWideChar(SourceLen) - 2;
    // each iteration stores 2 bytes, so require 2 free bytes in Dest: a single
    // remaining byte is left to the generic loop below
    if (PtrInt(PtrUInt(Dest)) + 1 < DestLen) and
       (Source <= Tail) then
      repeat
        c := PCardinal(Source)^;
        if c and $ff80ff80 <> 0 then
          break; // break on first non ASCII pair
        inc(Source, 2);
        c := c shr 8 or c; // pack both ASCII chars into the lower 16-bit
        PWord(Dest)^ := c;
        inc(Dest, 2);
      until (Source > Tail) or
            (PtrInt(PtrUInt(Dest)) + 1 >= DestLen);
    // generic loop, handling one UCS4 CodePoint per iteration
    if (PtrInt(PtrUInt(Dest)) < DestLen) and
       (PtrInt(PtrUInt(Source)) < SourceLen) then
      repeat
        // inlined Utf16CharToUtf8() with buffer overflow check and $FFFD on unmatch
        c := cardinal(Source^);
        inc(Source);
        case c of
          0..$7f:
            begin
              Dest^ := AnsiChar(c);
              inc(Dest);
              if (PtrInt(PtrUInt(Dest)) < DestLen) and
                 (PtrInt(PtrUInt(Source)) < SourceLen) then
                continue
              else
                break;
            end;
          UTF16_HISURROGATE_MIN..UTF16_HISURROGATE_MAX:
            if (PtrInt(PtrUInt(Source)) >= SourceLen) or
               ((cardinal(Source^) < UTF16_LOSURROGATE_MIN) or
                (cardinal(Source^) > UTF16_LOSURROGATE_MAX)) then
            begin
unmatch:      if (PtrInt(PtrUInt(@Dest[3])) > DestLen) or
                 not (ccfReplacementCharacterForUnmatchedSurrogate in Flags) then
                break;
              PWord(Dest)^ := $BFEF; // UTF-8 UNICODE_REPLACEMENT_CHARACTER
              Dest[2] := AnsiChar($BD);
              inc(Dest, 3);
              if (PtrInt(PtrUInt(Dest)) < DestLen) and
                 (PtrInt(PtrUInt(Source)) < SourceLen) then
                continue
              else
                break;
            end
            else
            begin
              c := ((c - UTF16_SURROGATE_OFFSET) shl 10) or
                   (cardinal(Source^) xor UTF16_LOSURROGATE_MIN);
              inc(Source);
            end;
          UTF16_LOSURROGATE_MIN..UTF16_LOSURROGATE_MAX:
            if (PtrInt(PtrUInt(Source)) >= SourceLen) or
               ((cardinal(Source^) < UTF16_HISURROGATE_MIN) or
                (cardinal(Source^) > UTF16_HISURROGATE_MAX)) then
              goto unmatch
            else
            begin
              c := ((cardinal(Source^) - UTF16_SURROGATE_OFFSET) shl 10) or
                   (c xor UTF16_LOSURROGATE_MIN);
              inc(Source);
            end;
        end; // now c is the UTF-32/UCS4 code point
        // compute how many UTF-8 bytes are needed for this code point
        if c <= $7ff then
          i := 2
        else if c <= $ffff then
          i := 3
        else if c <= $1FFFFF then
          i := 4
        else if c <= $3FFFFFF then
          i := 5
        else
          i := 6;
        if PtrInt(PtrUInt(Dest)) + i > DestLen then
          break; // not enough space in Dest for the whole sequence
        // store the continuation bytes backwards, then the leading byte
        j := i - 1;
        repeat
          Dest[j] := AnsiChar((c and $3f) or $80);
          c := c shr 6;
          dec(j);
        until j = 0;
        Dest^ := AnsiChar(byte(c) or UTF8_TABLE.FirstByte[i]);
        inc(Dest, i);
        if (PtrInt(PtrUInt(Dest)) < DestLen) and
           (PtrInt(PtrUInt(Source)) < SourceLen) then
          continue
        else
          break;
      until false;
    if not (ccfNoTrailingZero in Flags) then
      Dest^ := #0;
  end;
  result := PtrInt(PtrUInt(Dest)) - result;
end;
|
|
|
|
// Convert a UTF-16 buffer into UTF-8 stored within a TSynTempBuffer
// - the temporary buffer is initialized here, with 3 bytes per input WideChar,
// then its Len is set to the number of UTF-8 bytes actually generated
// - a nil or empty input initializes an empty buffer
procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
  var result: TSynTempBuffer; Flags: TCharConversionFlags);
begin
  if (WideChar <> nil) and
     (WideCharCount <> 0) then
  begin
    // reserve 3 UTF-8 bytes per UTF-16 code unit
    result.Init(WideCharCount * 3);
    // then replace Len by the number of bytes which were written
    result.Len := RawUnicodeToUtf8(
      result.buf, result.len, WideChar, WideCharCount, Flags);
  end
  else
    result.Init(0); // nothing to convert
end;
|
|
|
|
// Convert a UTF-16 buffer into a RawUtf8 string
// - the text is first converted into a TSynTempBuffer, then copied into result
procedure RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
  var result: RawUtf8; Flags: TCharConversionFlags);
var
  utf8buf: TSynTempBuffer;
begin
  // convert into a temporary buffer
  RawUnicodeToUtf8(WideChar, WideCharCount, utf8buf, Flags);
  // copy only the bytes which were generated into the final string
  FastSetString(result, utf8buf.buf, utf8buf.len);
  // release the temporary buffer
  utf8buf.Done;
end;
|
|
|
|
// Convert a UTF-16 buffer into a new RawUtf8 string
// - function-style wrapper around the overloaded procedure which fills a
// "var result: RawUtf8" parameter
function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
  Flags: TCharConversionFlags): RawUtf8;
begin
  // the function result is passed as the "var" output of the procedure
  RawUnicodeToUtf8(WideChar, WideCharCount, result, Flags);
end;
|
|
|
|
// Convert a UTF-16 buffer into a RawUtf8 string, returning the UTF-8 length
// - the returned string is allocated with WideCharCount * 3 bytes and is NOT
// truncated: Utf8Length receives the number of UTF-8 bytes which were stored
// - returns '' and Utf8Length = 0 if WideCharCount is 0 or nothing was converted
function RawUnicodeToUtf8(WideChar: PWideChar; WideCharCount: integer;
  out Utf8Length: integer): RawUtf8;
var
  LW: integer;
begin
  result := ''; // somewhat faster if result is freed before any SetLength()
  Utf8Length := 0; // an "out" integer is not initialized by the compiler
  if WideCharCount = 0 then
    exit;
  LW := WideCharCount * 3; // maximum resulting length
  SetLength(result, LW);
  // no trailing #0 is written by the conversion itself
  Utf8Length := RawUnicodeToUtf8(pointer(result), LW + 1,
    WideChar, WideCharCount, [ccfNoTrailingZero]);
  if Utf8Length <= 0 then
    result := '';
end;
|
|
|
|
// Convert a #0 terminated UTF-8 buffer into a ShortString
// - 7-bit chars are copied as such, other code points up to $ffff are mapped
// through WinAnsiConvert.fWideToAnsi[], and higher code points become '?'
// - the conversion stops at #0, at an invalid leading byte, at an invalid
// continuation byte, or once 253 chars have been stored
procedure Utf8ToShortString(var dest: ShortString; source: PUtf8Char);
var
  ch: cardinal;
  count, more, pending: integer;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  count := 0;
  if source <> nil then
    repeat
      ch := byte(source^);
      inc(source);
      if ch = 0 then
        break; // end of input
      if ch > 127 then
      begin
        // multi-byte sequence: decode its continuation bytes
        more := utf8.Lookup[ch];
        if more = UTF8_INVALID then
          break;
        pending := more;
        repeat
          if byte(source^) and $c0 <> $80 then
          begin
            // invalid UTF-8 content: return what was decoded so far
            dest[0] := AnsiChar(count);
            exit;
          end;
          ch := (ch shl 6) + byte(source^);
          inc(source);
          dec(pending);
        until pending = 0;
        dec(ch, utf8.Extra[more].offset);
        // #256.. -> slower but accurate conversion
        inc(count);
        if ch > $ffff then
          dest[count] := '?'
        else
          dest[count] := AnsiChar(WinAnsiConvert.fWideToAnsi[ch]);
      end
      else
      begin
        // 7-bit ASCII char is stored directly
        inc(count);
        dest[count] := AnsiChar(ch);
      end;
    until count >= 253; // output is limited to 253 chars
  dest[0] := AnsiChar(count);
end;
|
|
|
|
// Convert a UTF-8 buffer into UTF-16, with a maximum output length
// - faster than System.Utf8ToUnicode()
// - sourceBytes = 0 will use StrLen(source) to compute the input length
// - at most MaxDestChars WideChars are expected to be stored in dest; note
// that the first decoded char is stored before MaxDestChars is ever tested
// - returns the number of BYTES (not WideChars) written to dest
// - a WideChar(0) is appended after the output unless NoTrailingZero is set
// - the conversion stops at the first invalid or truncated UTF-8 sequence
function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char;
  MaxDestChars, sourceBytes: PtrInt; NoTrailingZero: boolean): PtrInt;
var
  c: cardinal;
  begd: PWideChar;
  endSource: PUtf8Char;
  endDest: PWideChar;
  i, extra: integer;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  Quit, NoSource;
begin
  result := 0;
  if dest = nil then
    exit;
  if source = nil then
    goto NoSource;
  if sourceBytes = 0 then
  begin
    if source^ = #0 then
      goto NoSource;
    sourceBytes := StrLen(source);
  end;
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  endSource := source + sourceBytes;
  endDest := dest + MaxDestChars;
  begd := dest;
  repeat
    c := byte(source^);
    inc(source);
    if c <= 127 then
    begin
      PWord(dest)^ := c; // much faster than dest^ := WideChar(c) for FPC
      inc(dest);
      if (source < endSource) and
         (dest < endDest) then
        continue
      else
        break;
    end;
    extra := utf8.Lookup[c];
    if (extra = UTF8_INVALID) or
       (source + extra > endSource) then
      break;
    i := 0;
    repeat
      // validate EACH continuation byte, not only the first one
      if byte(source[i]) and $c0 <> $80 then
        goto Quit; // invalid input content
      c := (c shl 6) + byte(source[i]);
      inc(i);
    until i = extra;
    inc(source, extra);
    with utf8.Extra[extra] do
    begin
      dec(c, offset);
      if c < minimum then
        break; // invalid input content
    end;
    if c <= $ffff then
    begin
      PWord(dest)^ := c;
      inc(dest);
      if (source < endSource) and
         (dest < endDest) then
        continue
      else
        break;
    end;
    if dest + 2 > endDest then
      break; // not enough room to store a whole surrogate pair
    dec(c, $10000); // store as UTF-16 surrogates
    PWordArray(dest)[0] := (c shr 10) or UTF16_HISURROGATE_MIN;
    PWordArray(dest)[1] := (c and $3FF) or UTF16_LOSURROGATE_MIN;
    inc(dest, 2);
    if (source >= endSource) or
       (dest >= endDest) then
      break;
  until false;
Quit:
  result := PtrUInt(dest) - PtrUInt(begd); // dest-begd return byte length
NoSource:
  if not NoTrailingZero then
    dest^ := #0; // always append a WideChar(0) to the end of the buffer
end;
|
|
|
|
// Convert a UTF-8 buffer into UTF-16, with no output length checking
// - faster than System.UTF8Decode()
// - sourceBytes = 0 will use StrLen(source) to compute the input length
// - returns the number of BYTES (not WideChars) written to dest
// - a WideChar(0) is appended after the output unless NoTrailingZero is set
// - the conversion stops at the first invalid or truncated UTF-8 sequence
// - 7-bit ASCII input is processed four chars at a time (by4), other content
// is processed one code point at a time (by1)
function Utf8ToWideChar(dest: PWideChar; source: PUtf8Char; sourceBytes: PtrInt;
  NoTrailingZero: boolean): PtrInt;
var
  c: cardinal;
  begd: PWideChar;
  endSource, endSourceBy4: PUtf8Char;
  i, extra: PtrInt;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  quit, nosource, by1, by4;
begin
  result := 0;
  if dest = nil then
    exit;
  if source = nil then
    goto nosource;
  if sourceBytes = 0 then
  begin
    if source^ = #0 then
      goto nosource;
    sourceBytes := StrLen(source);
  end;
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  begd := dest;
  endSource := source + sourceBytes;
  endSourceBy4 := endSource - 4;
  if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif}
     (source <= endSourceBy4) then
    repeat // handle 7-bit ASCII chars, by quad
      c := PCardinal(source)^;
      if c and $80808080 <> 0 then
        goto by1; // break on first non ASCII quad
by4:  inc(source, 4);
      // expand 4 ASCII bytes into 4 WideChars, as two 32-bit stores
      PCardinal(dest)^ := (c shl 8 or (c and $FF)) and $00ff00ff;
      c := c shr 16;
      PCardinal(dest + 2)^ := (c shl 8 or c) and $00ff00ff;
      inc(dest, 4);
    until source > endSourceBy4;
  if source < endSource then
    repeat
by1:  c := byte(source^);
      inc(source);
      if c <= 127 then
      begin
        PWord(dest)^ := c; // much faster than dest^ := WideChar(c) for FPC
        inc(dest);
        if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif}
           (source <= endSourceBy4) then
        begin
          c := PCardinal(source)^;
          if c and $80808080 = 0 then
            goto by4 // back to the quad process
          else
            continue;
        end;
        if source < endSource then
          continue
        else
          break;
      end;
      extra := utf8.Lookup[c];
      if (extra = UTF8_INVALID) or
         (source + extra > endSource) then
        break;
      i := 0;
      repeat
        // validate EACH continuation byte, not only the first one
        if byte(source[i]) and $c0 <> $80 then
          goto quit; // invalid input content
        c := (c shl 6) + byte(source[i]);
        inc(i);
      until i = extra;
      inc(source, extra);
      with utf8.Extra[extra] do
      begin
        dec(c, offset);
        if c < minimum then
          break; // invalid input content
      end;
      if c <= $ffff then
      begin
        PWord(dest)^ := c;
        inc(dest);
        if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif}
           (source <= endSourceBy4) then
        begin
          c := PCardinal(source)^;
          if c and $80808080 = 0 then
            goto by4;
          continue;
        end;
        if source < endSource then
          continue
        else
          break;
      end;
      dec(c, $10000); // store as UTF-16 surrogates
      PWordArray(dest)[0] := (c shr 10) or UTF16_HISURROGATE_MIN;
      PWordArray(dest)[1] := (c and $3FF) or UTF16_LOSURROGATE_MIN;
      inc(dest, 2);
      if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(source) and 3 = 0) and{$endif}
         (source <= endSourceBy4) then
      begin
        c := PCardinal(source)^;
        if c and $80808080 = 0 then
          goto by4;
        continue;
      end;
      if source >= endSource then
        break;
    until false;
quit:
  result := PtrUInt(dest) - PtrUInt(begd); // dest-begd returns bytes length
nosource:
  if not NoTrailingZero then
    dest^ := #0; // always append a WideChar(0) to the end of the buffer
end;
|
|
|
|
// Pure pascal check that a buffer of sourcelen bytes contains valid UTF-8
// - only the sequence structure is checked (leading byte followed by the
// expected number of $80..$bf continuation bytes), via UTF8_TABLE.Lookup[]
// - an invalid leading byte, a wrong continuation byte, or a multi-byte
// sequence truncated by the end of the buffer all return false
// - a #0 byte stops the parsing: the buffer is then reported as valid only if
// this #0 was its very last byte
// - source = nil returns true only if sourcelen = 0
function IsValidUtf8Pas(source: PUtf8Char; sourcelen: PtrInt): boolean;
var
  c: byte;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  done;
begin
  // from now on, sourcelen = (end of buffer address) - 4
  inc(PtrUInt(sourcelen), PtrUInt(source) - 4);
  if source = nil then
    goto done;
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  repeat
    if PtrUInt(source) <= PtrUInt(sourcelen) then
    begin
      // at least 4 bytes are available: try to skip 4 ASCII chars at once
      if utf8.Lookup[ord(source[0])] = UTF8_ASCII then
        if utf8.Lookup[ord(source[1])] = UTF8_ASCII then
          if utf8.Lookup[ord(source[2])] = UTF8_ASCII then
            if utf8.Lookup[ord(source[3])] = UTF8_ASCII then
            begin
              inc(source, 4); // optimized for JSON-like content
              continue;
            end
            else
              inc(source, 3)
          else
            inc(source, 2)
        else
          inc(source);
    end
    else if PtrUInt(source) >= PtrUInt(sourcelen) + 4 then
      break; // reached the end of the buffer
    c := utf8.Lookup[ord(source^)];
    inc(source);
    if c = UTF8_ASCII then
      continue
    else if c = UTF8_INVALID then
    begin
      // explicit failure: the "source = end" test below would wrongly succeed
      // if the invalid byte was the last of the buffer
      result := false;
      exit;
    end
    else if c = UTF8_ZERO then
      break; // #0 stops the parsing
    // c = extras -> check valid UTF-8 content
    repeat
      if (PtrUInt(source) >= PtrUInt(sourcelen) + 4) or // truncated sequence
         (byte(source^) and $c0 <> $80) then            // not a continuation
      begin
        result := false;
        exit;
      end;
      inc(source);
      dec(c);
    until c = 0;
  until false;
done:
  // valid only if the whole buffer was consumed
  result := PtrUInt(source) = PtrUInt(sourcelen) + 4;
end;
|
|
|
|
// Check that a #0 terminated buffer contains valid UTF-8
// - the length is computed with StrLen() then IsValidUtf8Buffer() is called
function IsValidUtf8(source: PUtf8Char): boolean;
var
  bytes: PtrInt;
begin
  bytes := StrLen(source);
  result := IsValidUtf8Buffer(source, bytes);
end;
|
|
|
|
// Check that a RawUtf8 string contains valid UTF-8
// - IsValidUtf8Buffer() is called over the whole length(source) bytes
function IsValidUtf8(const source: RawUtf8): boolean;
var
  bytes: PtrInt;
begin
  bytes := length(source);
  result := IsValidUtf8Buffer(pointer(source), bytes);
end;
|
|
|
|
// Call EnsureRawUtf8() on a non-empty string if its content is valid UTF-8
// - compiles to an empty procedure if HASCODEPAGE is not defined
procedure DetectRawUtf8(var source: RawByteString);
begin
  {$ifdef HASCODEPAGE} // do nothing on oldest Delphi
  if source <> '' then
    if IsValidUtf8(source) then
      EnsureRawUtf8(source);
  {$endif HASCODEPAGE}
end;
|
|
|
|
// Check that a #0 terminated buffer is UTF-8 with no #1..#31 control char
// - returns false on any #1..#31 char, invalid leading byte or invalid
// continuation byte
// - returns true once the ending #0 is reached, and also for source = nil
function IsValidUtf8WithoutControlChars(source: PUtf8Char): boolean;
var
  c: byte;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := false;
  if source <> nil then
    repeat
      c := byte(source^);
      inc(source);
      if c = 0 then
        break; // reached end of input
      if c < 32 then
        exit; // disallow #1..#31 control char
      if c > $7f then
      begin
        c := utf8.Lookup[c]; // c = number of expected continuation bytes
        if c = UTF8_INVALID then
          exit;
        // check valid UTF-8 content
        repeat
          if byte(source^) and $c0 <> $80 then
            exit;
          inc(source);
          dec(c);
        until c = 0;
      end;
    until false;
  result := true;
end;
|
|
|
|
// Check that a RawUtf8 string is UTF-8 with no #0..#31 control char
// - returns false on any #0..#31 char, invalid leading byte or invalid
// continuation byte
// - an empty string returns true
function IsValidUtf8WithoutControlChars(const source: RawUtf8): boolean;
var
  s, len: PtrInt;
  c: byte;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := false;
  len := Length(source);
  s := 1;
  while s <= len do
  begin
    c := byte(source[s]);
    inc(s);
    if c < 32 then
      exit; // disallow #0..#31 control char
    if c <= $7f then
      continue; // regular 7-bit char
    c := utf8.Lookup[c]; // c = number of expected continuation bytes
    if c = UTF8_INVALID then
      exit;
    // check valid UTF-8 content
    repeat
      if byte(source[s]) and $c0 <> $80 then
        exit;
      inc(s);
      dec(c);
    until c = 0;
  end;
  result := true;
end;
|
|
|
|
// Compute the number of UTF-16 code units needed for a #0 terminated UTF-8 buffer
// - each sequence counts for 1, or for 2 if its Lookup[] value is
// >= UTF8_EXTRA_SURROGATE
// - the count stops at #0, and also at the first invalid leading or
// continuation byte: the value accumulated so far is then returned
function Utf8ToUnicodeLength(source: PUtf8Char): PtrUInt;
var
  c: byte;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := 0;
  if source = nil then
    exit;
  repeat
    c := utf8.Lookup[byte(source^)];
    inc(source);
    case c of
      UTF8_ASCII:
        inc(result);
      UTF8_ZERO:
        break; // end of input
      UTF8_INVALID:
        exit;
    else
      begin
        inc(result, 1 + ord(c >= UTF8_EXTRA_SURROGATE));
        // check valid UTF-8 content
        repeat
          if byte(source^) and $c0 <> $80 then
            exit;
          inc(source);
          dec(c);
        until c = 0;
      end;
    end;
  until false;
end;
|
|
|
|
// Truncate a UTF-8 string so that it needs at most maxUtf16 UTF-16 code units
// - returns true if text was truncated, false if it was left untouched
// - nothing is done if maxUtf16 is not lower than the byte length of text
// - text is left untouched (and false is returned) if a #0 or an invalid
// leading byte is met before maxUtf16 code units were counted
// - truncation always happens before the leading byte of the first code point
// which does not fit, so that no partial sequence is left at the end
// - maxUtf16 = 0 truncates a non-empty text to ''
function Utf8TruncateToUnicodeLength(var text: RawUtf8; maxUtf16: integer): boolean;
var
  c: byte;
  source, start: PUtf8Char;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := false;
  source := pointer(text);
  if (source = nil) or
     (cardinal(maxUtf16) >= cardinal(Length(text))) then
    exit; // nothing to truncate (also covers a negative maxUtf16)
  while maxUtf16 > 0 do
  begin
    start := source; // remember where this code point begins
    c := utf8.Lookup[byte(source^)];
    inc(source);
    if c = UTF8_ASCII then
      dec(maxUtf16)
    else if (c = UTF8_ZERO) or
            (c = UTF8_INVALID) then
      exit // end of text or invalid content: leave text as it is
    else
    begin
      dec(maxUtf16, 1 + ord(c >= UTF8_EXTRA_SURROGATE));
      if maxUtf16 < 0 then
      begin
        // not enough place for this UTF-8 codepoint: cut before its lead byte
        source := start;
        break;
      end;
      // skip the expected continuation bytes
      repeat
        if byte(source^) and $c0 <> $80 then
          break;
        inc(source);
        dec(c);
      until c = 0;
    end;
  end;
  SetLength(text, source - pointer(text));
  result := true;
end;
|
|
|
|
// Truncate a UTF-8 string to at most maxBytes bytes, on a code point boundary
// - returns false, with text untouched, if length(text) < maxBytes
// - otherwise the cut position is moved backward over any continuation bytes,
// then over a leading byte > #$7f if any, and true is returned
function Utf8TruncateToLength(var text: RawUtf8; maxBytes: PtrUInt): boolean;
begin
  result := PtrUInt(Length(text)) >= maxBytes;
  if not result then
    exit; // nothing to truncate
  // step back over $80..$bf continuation bytes
  while (maxBytes > 0) and
        (ord(text[maxBytes]) and $c0 = $80) do
    dec(maxBytes);
  // then drop the leading byte of the multi-byte sequence
  if (maxBytes > 0) and
     (ord(text[maxBytes]) and $80 <> 0) then
    dec(maxBytes);
  SetLength(text, maxBytes);
end;
|
|
|
|
// Compute the length a UTF-8 string would have once truncated to maxBytes
// - returns length(text) if it is < maxBytes
// - returns maxBytes if the byte at this (1-based) position is <= #$7f
// - otherwise the position is moved backward over any continuation bytes,
// then over a leading byte > #$7f if any
function Utf8TruncatedLength(const text: RawUtf8; maxBytes: PtrUInt): PtrInt;
begin
  result := Length(text);
  if PtrUInt(result) >= maxBytes then
  begin
    result := maxBytes;
    if (result <> 0) and
       (ord(text[result]) and $80 <> 0) then
    begin
      // step back over $80..$bf continuation bytes
      while (result > 0) and
            (ord(text[result]) and $c0 = $80) do
        dec(result);
      // then drop the leading byte of the multi-byte sequence
      if (result > 0) and
         (ord(text[result]) and $80 <> 0) then
        dec(result);
    end;
  end;
end;
|
|
|
|
// Compute the length a UTF-8 buffer would have once truncated to maxBytes
// - returns textlen if the buffer already fits in maxBytes
// - otherwise returns the highest length <= maxBytes which does not cut a
// multi-byte sequence in the middle
// - text is a 0-based buffer, so text[result] is the first byte which is NOT
// kept: the cut is valid as soon as this byte is not a continuation byte
function Utf8TruncatedLength(text: PAnsiChar; textlen, maxBytes: PtrUInt): PtrInt;
begin
  result := textlen;
  if textlen <= maxBytes then
    exit; // fits: nothing to truncate, and text[textlen] is never read
  result := maxBytes;
  // move back until the first excluded byte starts a new code point
  while (result > 0) and
        (ord(text[result]) and $c0 = $80) do
    dec(result);
end;
|
|
|
|
// Compute the number of UTF-16 code units of the first line of a UTF-8 buffer
// - the count stops at the first #0, #10 or #13 char
// - the count also stops at an invalid leading byte
// - continuation bytes are skipped without being checked
function Utf8FirstLineToUtf16Length(source: PUtf8Char): PtrInt;
var
  c: PtrUInt;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := 0;
  if source = nil then
    exit;
  repeat
    c := byte(source^);
    inc(source);
    if c > $7f then
    begin
      c := utf8.Lookup[c]; // c = number of continuation bytes
      if c = UTF8_INVALID then
        exit; // invalid leading byte
      inc(result, 1 + ord(c >= UTF8_EXTRA_SURROGATE));
      inc(source, c); // a bit less safe, but faster
    end
    else if (c = 0) or
            (c = 10) or
            (c = 13) then
      break // #0, #10 or #13 stop the count
    else
      inc(result);
  until false;
end;
|
|
|
|
|
|
|
|
{ **************** UTF-8 / Unicode / Ansi Conversion Classes }
|
|
|
|
var
  // TSynAnsiConvert instances created so far, as filled by NewEngine()
  SynAnsiConvertList: array of TSynAnsiConvert;
  // read/write lock acquired by GetEngine() and NewEngine() around the list
  SynAnsiConvertListLock: TRWLightLock;
  // number of items stored in SynAnsiConvertListCodePage[]
  SynAnsiConvertListCount: integer;
  // code page of each SynAnsiConvertList[] item, for fast lookup in CPU L1 cache
  SynAnsiConvertListCodePage: TWordDynArray;
|
|
|
|
|
|
{ TSynAnsiConvert }
|
|
|
|
// Convert an Ansi buffer of this code page into UTF-16
// - leading 7-bit ASCII chars are converted here, the remaining content is
// passed to Unicode_AnsiToWide() with fCodePage
// - returns the position just after the last WideChar stored in Dest
// - a WideChar(0) is stored at this position unless NoTrailingZero is set
function TSynAnsiConvert.AnsiBufferToUnicode(Dest: PWideChar;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar;
var
  quad: cardinal;
begin
  // first handle leading 7-bit ASCII chars, by quad (Sha optimization)
  if SourceChars >= 4 then
    repeat
      quad := PCardinal(Source)^;
      if quad and $80808080 <> 0 then
        break; // break on first non ASCII quad
      inc(Source, 4);
      dec(SourceChars, 4);
      // expand 4 ASCII bytes into 4 WideChars, as two 32-bit stores
      PCardinal(Dest)^ := (quad shl 8 or (quad and $FF)) and $00ff00ff;
      quad := quad shr 16;
      PCardinal(Dest + 2)^ := (quad shl 8 or quad) and $00ff00ff;
      inc(Dest, 4);
    until SourceChars < 4;
  // then any remaining leading 7-bit ASCII chars, one by one
  if (SourceChars > 0) and
     (ord(Source^) < 128) then
    repeat
      PWord(Dest)^ := ord(Source^); // faster than dest^ := WideChar(c) on FPC
      inc(Dest);
      inc(Source);
      dec(SourceChars);
    until (SourceChars = 0) or
          (ord(Source^) >= 128);
  // rely on the Operating System for whatever was not converted above
  if SourceChars > 0 then
    inc(Dest,
      Unicode_AnsiToWide(Source, Dest, SourceChars, SourceChars, fCodePage));
  if not NoTrailingZero then
    Dest^ := #0;
  result := Dest;
end;
|
|
|
|
// Convert an Ansi buffer of this code page into UTF-8
// - leading 7-bit ASCII chars are copied as such, the remaining content is
// converted to UTF-16 in a temporary buffer, then to UTF-8
// - returns the position just after the last byte stored in Dest
// - a #0 is stored at this position unless NoTrailingZero is set
function TSynAnsiConvert.AnsiBufferToUtf8(Dest: PUtf8Char; Source: PAnsiChar;
  SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char;
var
  wide: TSynTempBuffer;
  quad: cardinal;
  wideEnd: PWideChar;
begin
  // first handle leading 7-bit ASCII chars, by quad (Sha optimization)
  if SourceChars >= 4 then
    repeat
      quad := PCardinal(Source)^;
      if quad and $80808080 <> 0 then
        break; // break on first non ASCII quad
      PCardinal(Dest)^ := quad; // ASCII is identical in UTF-8
      inc(Source, 4);
      inc(Dest, 4);
      dec(SourceChars, 4);
    until SourceChars < 4;
  // then any remaining leading 7-bit ASCII chars, one by one
  if (SourceChars > 0) and
     (ord(Source^) < 128) then
    repeat
      Dest^ := Source^;
      inc(Source);
      inc(Dest);
      dec(SourceChars);
    until (SourceChars = 0) or
          (ord(Source^) >= 128);
  if SourceChars = 0 then
    result := Dest // everything was plain ASCII
  else
  begin
    // convert what is left to UTF-16, then from UTF-16 to UTF-8
    wideEnd := AnsiBufferToUnicode(wide.Init(SourceChars * 3), Source, SourceChars);
    result := Dest + RawUnicodeToUtf8(Dest, SourceChars * 3, wide.buf,
      (PtrUInt(wideEnd) - PtrUInt(wide.buf)) shr 1, [ccfNoTrailingZero]);
    wide.Done;
  end;
  if not NoTrailingZero then
    result^ := #0;
end;
|
|
|
|
// UTF-8 is AT MOST 50% bigger than UTF-16 in bytes in range U+0800..U+FFFF
|
|
// see http://stackoverflow.com/a/7008095 -> bytes=WideCharCount*3 below
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
|
|
// Convert an Ansi string of this code page into a RawUnicode string
// - calls the overloaded method over the whole content of AnsiText
function TSynAnsiConvert.AnsiToRawUnicode(const AnsiText: RawByteString): RawUnicode;
var
  chars: cardinal;
begin
  chars := length(AnsiText);
  result := AnsiToRawUnicode(pointer(AnsiText), chars);
end;
|
|
|
|
// Convert an Ansi buffer of this code page into a RawUnicode string
// - returns '' if SourceChars is 0
// - the result contains the UTF-16 bytes followed by one extra #0 byte
function TSynAnsiConvert.AnsiToRawUnicode(Source: PAnsiChar;
  SourceChars: cardinal): RawUnicode;
var
  wideEnd: PWideChar;
  wide: TSynTempBuffer;
begin
  if SourceChars = 0 then
  begin
    result := '';
    exit;
  end;
  // convert into a temporary UTF-16 buffer
  wideEnd := AnsiBufferToUnicode(wide.Init(SourceChars * 2), Source, SourceChars);
  wideEnd^ := #0;
  // copy the UTF-16 bytes plus one additional byte of the ending WideChar(0)
  SetString(result, PAnsiChar(wide.buf), PtrUInt(wideEnd) - PtrUInt(wide.buf) + 1);
  wide.Done;
end;
|
|
|
|
{$endif PUREMORMOT2}
|
|
|
|
// Convert an Ansi buffer of this code page into a SynUnicode string
// - returns '' if SourceChars is 0
function TSynAnsiConvert.AnsiToUnicodeString(Source: PAnsiChar;
  SourceChars: cardinal): SynUnicode;
var
  wide: TSynTempBuffer;
  wideEnd: PWideChar;
begin
  if SourceChars = 0 then
  begin
    result := '';
    exit;
  end;
  // convert into a temporary UTF-16 buffer
  wideEnd := AnsiBufferToUnicode(wide.Init(SourceChars * 2), Source, SourceChars);
  // the WideChar count is the generated byte length divided by two
  SetString(result, PWideChar(wide.buf), (PtrUInt(wideEnd) - PtrUInt(wide.buf)) shr 1);
  wide.Done;
end;
|
|
|
|
// Convert an Ansi string of this code page into a SynUnicode string
// - returns '' if Source is empty
function TSynAnsiConvert.AnsiToUnicodeString(const Source: RawByteString): SynUnicode;
var
  wide: TSynTempBuffer;
  wideEnd: PWideChar;
begin
  if Source = '' then
  begin
    result := '';
    exit;
  end;
  wide.Init(length(Source) * 2); // max dest size in bytes
  wideEnd := AnsiBufferToUnicode(wide.buf, pointer(Source), length(Source));
  // the WideChar count is the generated byte length divided by two
  SetString(result, PWideChar(wide.buf), (PtrUInt(wideEnd) - PtrUInt(wide.buf)) shr 1);
  wide.Done;
end;
|
|
|
|
// Convert an Ansi string of this code page into a RawUtf8 string
// - calls AnsiBufferToRawUtf8() over the whole content of AnsiText
function TSynAnsiConvert.AnsiToUtf8(const AnsiText: RawByteString): RawUtf8;
var
  chars: cardinal;
begin
  chars := length(AnsiText);
  AnsiBufferToRawUtf8(pointer(AnsiText), chars, result);
end;
|
|
|
|
// Convert an Ansi buffer of this code page into a RawUtf8 string
// - Value is not assigned by this method if Source is nil or SourceChars is 0
// - the conversion uses a temporary buffer of SourceChars * 3 bytes
procedure TSynAnsiConvert.AnsiBufferToRawUtf8(Source: PAnsiChar;
  SourceChars: cardinal; out Value: RawUtf8);
var
  utf8buf: TSynTempBuffer;
  utf8end: PUtf8Char;
begin
  if (Source <> nil) and
     (SourceChars <> 0) then
  begin
    // convert into the temporary buffer
    utf8end := AnsiBufferToUtf8(utf8buf.Init(SourceChars * 3), Source, SourceChars);
    // copy the bytes which were generated into the output string
    FastSetString(Value, utf8buf.buf, utf8end - utf8buf.buf);
    utf8buf.Done;
  end;
end;
|
|
|
|
// Initialize the conversion engine for a given code page
// - stores the code page and sets fAnsiCharShift to 1
constructor TSynAnsiConvert.Create(aCodePage: cardinal);
begin
  fAnsiCharShift := 1; // default is safe
  fCodePage := aCodePage;
end;
|
|
|
|
// Search for an already registered TSynAnsiConvert engine
// - SynAnsiConvertListCodePage[] is scanned within the read lock
// - returns nil if no engine is registered for aCodePage
function GetEngine(aCodePage: cardinal): TSynAnsiConvert;
  {$ifdef HASINLINE} inline; {$endif}
var
  ndx: PtrInt;
begin
  result := nil;
  SynAnsiConvertListLock.ReadLock; // concurrent read lock
  ndx := WordScanIndex(pointer(SynAnsiConvertListCodePage),
    SynAnsiConvertListCount, aCodePage); // SSE2 asm on i386 and x86_64
  if ndx >= 0 then
    result := SynAnsiConvertList[ndx];
  SynAnsiConvertListLock.ReadUnLock;
end;
|
|
|
|
// Create and register a TSynAnsiConvert engine for a code page
// - runs within the write lock, and searches the list again first, so that an
// engine registered in the meantime is returned instead of a duplicate
// - the class is TSynAnsiUtf8 for CP_UTF8, TSynAnsiUtf16 for CP_UTF16,
// TSynAnsiFixedWidth if IsFixedWidthCodePage() is true, else TSynAnsiConvert
// - the new instance is passed to RegisterGlobalShutdownRelease()
function NewEngine(aCodePage: cardinal): TSynAnsiConvert;
var
  ndx: PtrInt;
begin
  SynAnsiConvertListLock.WriteLock;
  try
    ndx := WordScanIndex(pointer(SynAnsiConvertListCodePage),
      SynAnsiConvertListCount, aCodePage); // search again for thread safety
    if ndx >= 0 then
      result := SynAnsiConvertList[ndx] // avoid any (unlikely) race condition
    else
    begin
      // select the class matching this code page
      if aCodePage = CP_UTF8 then
        result := TSynAnsiUtf8.Create(CP_UTF8)
      else if aCodePage = CP_UTF16 then
        result := TSynAnsiUtf16.Create(CP_UTF16)
      else if IsFixedWidthCodePage(aCodePage) then
        result := TSynAnsiFixedWidth.Create(aCodePage)
      else
        result := TSynAnsiConvert.Create(aCodePage);
      // register the new instance
      RegisterGlobalShutdownRelease(result);
      ObjArrayAdd(SynAnsiConvertList, result);
      AddWord(SynAnsiConvertListCodePage, SynAnsiConvertListCount, aCodePage);
    end;
  finally
    SynAnsiConvertListLock.WriteUnLock;
  end;
end;
|
|
|
|
// Return the TSynAnsiConvert engine associated with a code page
// - CP_ACP returns CurrentAnsiConvert
// - otherwise an already registered engine is returned if any
// - else CP_RAWBLOB returns RawByteStringConvert, and any other code page
// gets a new engine from NewEngine()
class function TSynAnsiConvert.Engine(aCodePage: cardinal): TSynAnsiConvert;
begin
  if aCodePage = CP_ACP then
  begin
    result := CurrentAnsiConvert;
    exit;
  end;
  result := GetEngine(aCodePage);
  if result <> nil then
    exit; // already registered
  if aCodePage = CP_RAWBLOB then
    result := RawByteStringConvert // CP_RAWBLOB is internal -> no engine
  else
    result := NewEngine(aCodePage);
end;
|
|
|
|
// Convert a UTF-16 buffer into an Ansi buffer of this code page
// - a leading BOM_UTF16LE is skipped
// - leading 7-bit ASCII chars are converted here, the remaining content is
// passed to Unicode_WideToAnsi() with fCodePage
// - returns the position just after the last byte stored in Dest
// - no trailing #0 is stored by this method
function TSynAnsiConvert.UnicodeBufferToAnsi(Dest: PAnsiChar;
  Source: PWideChar; SourceChars: cardinal): PAnsiChar;
var
  pair: cardinal;
begin
  if (Source <> nil) and
     (SourceChars <> 0) then
  begin
    // ignore any leading BOM (do exist on Windows files)
    if Source^ = BOM_UTF16LE then
    begin
      inc(Source);
      dec(SourceChars);
    end;
    // first handle leading 7-bit ASCII chars, by pairs (Sha optimization)
    if SourceChars >= 2 then
      repeat
        pair := PCardinal(Source)^;
        if pair and $ff80ff80 <> 0 then
          break; // break on first non ASCII pair
        inc(Source, 2);
        dec(SourceChars, 2);
        pair := pair shr 8 or pair; // pack both chars into the lower 16-bit
        PWord(Dest)^ := pair;
        inc(Dest, 2);
      until SourceChars < 2;
    // then any remaining leading 7-bit ASCII chars, one by one
    if (SourceChars > 0) and
       (ord(Source^) < 128) then
      repeat
        Dest^ := AnsiChar(ord(Source^));
        inc(Source);
        inc(Dest);
        dec(SourceChars);
      until (SourceChars = 0) or
            (ord(Source^) >= 128);
    // rely on the Operating System for whatever was not converted above
    if SourceChars <> 0 then
      inc(Dest,
        Unicode_WideToAnsi(Source, Dest, SourceChars, SourceChars * 3, fCodePage));
  end;
  result := Dest;
end;
|
|
|
|
// Convert a UTF-8 buffer into an Ansi buffer of this code page
// - the input is first decoded as UTF-16 into a temporary buffer, then
// converted by UnicodeBufferToAnsi()
// - returns the position just after the last byte stored in Dest, i.e. Dest
// itself if Source is nil or SourceChars is 0
function TSynAnsiConvert.Utf8BufferToAnsi(Dest: PAnsiChar;
  Source: PUtf8Char; SourceChars: cardinal): PAnsiChar;
var
  wide: TSynTempBuffer;
begin
  if (Source <> nil) and
     (SourceChars <> 0) then
  begin
    wide.Init((SourceChars + 1) shl fAnsiCharShift);
    // Utf8ToWideChar() returns a byte count: shr 1 gives the WideChar count
    result := UnicodeBufferToAnsi(Dest, wide.buf,
      Utf8ToWideChar(wide.buf, Source, SourceChars) shr 1);
    wide.Done;
  end
  else
    result := Dest; // nothing to convert
end;
|
|
|
|
// Convert a UTF-8 buffer into a new Ansi string of this code page
// - function-style wrapper around the overloaded procedure which fills a
// "var result: RawByteString" parameter
function TSynAnsiConvert.Utf8BufferToAnsi(
  Source: PUtf8Char; SourceChars: cardinal): RawByteString;
begin
  // the function result is passed as the "var" output of the procedure
  Utf8BufferToAnsi(Source, SourceChars, result);
end;
|
|
|
|
// Convert a UTF-8 buffer into an Ansi string of this code page
// - result is set to '' if Source is nil or SourceChars is 0
// - a stack buffer is used when (SourceChars + 1) shl fAnsiCharShift fits in
// it, otherwise result is allocated first then its length is adjusted
procedure TSynAnsiConvert.Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal;
  var result: RawByteString);
var
  stack: array[word] of AnsiChar;
  needed: PtrInt;
begin
  if (Source = nil) or
     (SourceChars = 0) then
  begin
    result := '';
    exit;
  end;
  needed := (SourceChars + 1) shl fAnsiCharShift;
  if needed >= SizeOf(stack) then
  begin
    // huge strings will be allocated once and truncated, not resized
    FastSetStringCP(result, nil, needed, fCodePage);
    FakeLength(result,
      Utf8BufferToAnsi(pointer(result), Source, SourceChars) - pointer(result));
  end
  else
    // use a temporary stack buffer up to 64KB
    FastSetStringCP(result, @stack,
      Utf8BufferToAnsi(@stack, Source, SourceChars) - PAnsiChar(@stack), fCodePage);
end;
|
|
|
|
// Convert a RawUtf8 string into an Ansi string of this code page
// - calls Utf8BufferToAnsi() over the whole content of u
function TSynAnsiConvert.Utf8ToAnsi(const u: RawUtf8): RawByteString;
var
  bytes: cardinal;
begin
  bytes := length(u);
  Utf8BufferToAnsi(pointer(u), bytes, result);
end;
|
|
|
|
// Convert a RawUtf8 string into a #0 terminated Ansi buffer
// - at most 2048 bytes of S are converted, via a stack buffer
// - at most DestSize - 1 bytes are copied to Dest, followed by a #0
// - returns the number of bytes copied, the #0 excluded
// - returns 0 with nothing stored if Dest is nil or DestSize <= 0
function TSynAnsiConvert.Utf8ToAnsiBuffer2K(const S: RawUtf8;
  Dest: PAnsiChar; DestSize: integer): integer;
var
  buf: array[0..2047] of AnsiChar; // truncated to 2KB as documented
begin
  result := 0;
  if (DestSize <= 0) or
     (Dest = nil) then
    exit;
  result := length(S);
  if result > 0 then
  begin
    // limit the input to the size of the stack buffer
    if result > SizeOf(buf) then
      result := SizeOf(buf);
    result := Utf8BufferToAnsi(buf{%H-}, pointer(S), result) - {%H-}buf;
    // keep one byte of Dest for the trailing #0
    if result >= DestSize then
      result := DestSize - 1;
    MoveFast(buf, Dest^, result);
  end;
  Dest[result] := #0;
end;
|
|
|
|
// Convert a UTF-16 buffer into a new Ansi string of this code page
// - returns '' if Source is nil or SourceChars is 0
// - the conversion uses a temporary buffer of SourceChars * 3 bytes
function TSynAnsiConvert.UnicodeBufferToAnsi(Source: PWideChar;
  SourceChars: cardinal): RawByteString;
var
  ansi: TSynTempBuffer;
begin
  if (Source <> nil) and
     (SourceChars <> 0) then
  begin
    ansi.Init(SourceChars * 3);
    // the string length is the distance between the buffer start and end
    FastSetStringCP(result, ansi.buf, UnicodeBufferToAnsi(
      ansi.buf, Source, SourceChars) - PAnsiChar(ansi.buf), fCodePage);
    ansi.Done;
  end
  else
    result := '';
end;
|
|
|
|
// Convert a SynUnicode string into an Ansi string of this code page
// - calls UnicodeBufferToAnsi() over the length(Source) WideChars
function TSynAnsiConvert.UnicodeStringToAnsi(const Source: SynUnicode): RawByteString;
var
  chars: cardinal;
begin
  chars := length(Source);
  result := UnicodeBufferToAnsi(pointer(Source), chars);
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
// Convert a RawUnicode string into an Ansi string of this code page
// - the WideChar count is half of the byte length of Source
function TSynAnsiConvert.RawUnicodeToAnsi(const Source: RawUnicode): RawByteString;
var
  chars: cardinal;
begin
  chars := length(Source) shr 1;
  result := UnicodeBufferToAnsi(pointer(Source), chars);
end;
|
|
{$endif PUREMORMOT2}
|
|
|
|
// Convert an Ansi string from the code page of another engine into this one
// - Source is returned as such if From is this very instance
// - otherwise calls the overloaded method over the whole content of Source
function TSynAnsiConvert.AnsiToAnsi(From: TSynAnsiConvert;
  const Source: RawByteString): RawByteString;
begin
  if From <> self then
    result := AnsiToAnsi(From, pointer(Source), length(Source))
  else
    result := Source; // same engine: nothing to convert
end;
|
|
|
|
// Convert an Ansi buffer from the code page of another engine into this one
// - returns '' if Source is nil or SourceChars is 0
// - if both engines share the same code page, the input bytes are copied
// - otherwise the input is converted to UTF-16 by From, then to this code page
function TSynAnsiConvert.AnsiToAnsi(From: TSynAnsiConvert;
  Source: PAnsiChar; SourceChars: cardinal): RawByteString;
var
  tmp: TSynTempBuffer;
  U: PWideChar;
begin
  // the empty input must be tested first: FastSetStringCP() is called in this
  // unit with a nil source to pre-allocate a buffer, so a nil Source with a
  // non-zero SourceChars would presumably return unspecified content
  if (Source = nil) or
     (SourceChars = 0) then
    result := ''
  else if From.fCodePage = fCodePage then
    // same code page: plain copy of the input bytes
    FastSetStringCP(result, Source, SourceChars, fCodePage)
  else
  begin
    // use UTF-16 as pivot between the two code pages
    U := tmp.Init(SourceChars * 2 + 2);
    result := UnicodeBufferToAnsi(U,
      From.AnsiBufferToUnicode(U, Source, SourceChars) - U);
    tmp.Done;
  end;
end;
|
|
|
|
|
|
{ TSynAnsiFixedWidth }
|
|
|
|
// Convert a fixed-width Ansi buffer into UTF-16 using the fAnsiToWide[] table
// - each input byte is replaced by its 16-bit table entry
// - returns the position just after the last WideChar stored in Dest
// - a WideChar(0) is stored at this position unless NoTrailingZero is set
function TSynAnsiFixedWidth.AnsiBufferToUnicode(Dest: PWideChar;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar;
var
  n: integer;
  map: PWordArray;
begin
  // PWord*(Dest)[] is much faster than dest^ := WideChar(c) for FPC
  map := pointer(fAnsiToWide);
  // main loop is unrolled to convert four chars per iteration
  for n := 1 to SourceChars shr 2 do
  begin
    PWordArray(Dest)[0] := map[Ord(Source[0])];
    PWordArray(Dest)[1] := map[Ord(Source[1])];
    PWordArray(Dest)[2] := map[Ord(Source[2])];
    PWordArray(Dest)[3] := map[Ord(Source[3])];
    inc(Dest, 4);
    inc(Source, 4);
  end;
  // then the 0..3 remaining chars
  for n := 1 to SourceChars and 3 do
  begin
    PWord(Dest)^ := map[Ord(Source^)];
    inc(Source);
    inc(Dest);
  end;
  if not NoTrailingZero then
    Dest^ := #0;
  result := Dest;
end;
|
|
|
|
// Convert a fixed-width Ansi buffer into UTF-8 using the fAnsiToWide[] table
// - returns nil if self or Dest is nil
// - otherwise returns the position just after the last byte stored in Dest
// - a #0 is stored at this position unless NoTrailingZero is set
// - 7-bit ASCII input is copied four chars at a time (by4), other content is
// processed one char at a time (by1), as 2 or 3 UTF-8 bytes
function TSynAnsiFixedWidth.AnsiBufferToUtf8(Dest: PUtf8Char;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char;
var
  EndSource, EndSourceBy4: PAnsiChar;
  c: cardinal;
label
  by4, by1; // ugly but faster
begin
  if (self = nil) or
     (Dest = nil) then
  begin
    result := nil;
    Exit;
  end
  else if (Source <> nil) and
          (SourceChars > 0) then
  begin
    // handle 7-bit ASCII chars, by quads
    EndSource := Source + SourceChars;
    EndSourceBy4 := EndSource - 4;
    if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
       (Source <= EndSourceBy4) then
      repeat
        c := PCardinal(Source)^;
        if c and $80808080 <> 0 then
          goto by1; // break on first non ASCII quad
by4:    inc(Source, 4);
        PCardinal(Dest)^ := c; // ASCII is identical in UTF-8
        inc(Dest, 4);
      until Source > EndSourceBy4;
    // generic loop, handling one char per iteration
    if Source < EndSource then
      repeat
by1:    c := byte(Source^);
        inc(Source);
        if c <= $7F then
        begin
          Dest^ := AnsiChar(c); // 0..127 don't need any translation
          Inc(Dest);
          if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
             (Source <= EndSourceBy4) then
          begin
            c := PCardinal(Source)^;
            if c and $80808080 = 0 then
              goto by4; // back to the quad process
            continue;
          end;
          if Source < EndSource then
            continue
          else
            break;
        end
        else
        begin
          // no surrogate is expected in TSynAnsiFixedWidth charsets
          c := fAnsiToWide[c]; // convert FixedAnsi char into Unicode char
          if c > $7ff then
          begin
            // store as 3 UTF-8 bytes
            Dest[0] := AnsiChar($E0 or (c shr 12));
            Dest[1] := AnsiChar($80 or ((c shr 6) and $3F));
            Dest[2] := AnsiChar($80 or (c and $3F));
            Inc(Dest, 3);
            if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
               (Source <= EndSourceBy4) then
            begin
              c := PCardinal(Source)^;
              if c and $80808080 = 0 then
                goto by4;
              continue;
            end;
            if Source < EndSource then
              continue
            else
              break;
          end
          else
          begin
            // store as 2 UTF-8 bytes
            Dest[0] := AnsiChar($C0 or (c shr 6));
            Dest[1] := AnsiChar($80 or (c and $3F));
            Inc(Dest, 2);
            // same inclusive bound as the other branches, so that a final
            // quad of exactly 4 remaining chars can use the fast path
            if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
               (Source <= EndSourceBy4) then
            begin
              c := PCardinal(Source)^;
              if c and $80808080 = 0 then
                goto by4;
              continue;
            end;
            if Source < EndSource then
              continue
            else
              break;
          end;
        end;
      until false;
  end;
  if not NoTrailingZero then
    Dest^ := #0;
  {$ifdef ISDELPHI104}
  exit(Dest); // circumvent Delphi 10.4 optimizer bug
  {$else}
  result := Dest;
  {$endif ISDELPHI104}
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
// convert an Ansi buffer into a RawUnicode (UTF-16 stored in a byte string)
// - returns '' when SourceChars is 0
function TSynAnsiFixedWidth.AnsiToRawUnicode(Source: PAnsiChar;
  SourceChars: cardinal): RawUnicode;
begin
  if SourceChars <> 0 then
  begin
    // two bytes per char, + 1 byte so that the final WideChar #0 fits
    SetString(result, nil, SourceChars * 2 + 1);
    AnsiBufferToUnicode(pointer(result), Source, SourceChars);
  end
  else
    result := '';
end;
|
|
{$endif PUREMORMOT2}
|
|
|
|
const
|
|
  /// reference set for WinAnsi to Unicode conversion
|
|
  // - this table stores the Unicode code points of the Ansi Code Page 1252
|
|
  // (i.e. WinAnsi) chars 128..159; a few entries (129, 141, 143, 144, 157) map to themselves
|
|
  // - values taken from MultiByteToWideChar(1252,0,@Tmp,256,@WinAnsiTable,256)
|
|
  // so they are available outside the Windows platforms (e.g. Linux/BSD) and even
|
|
  // if the system has been tweaked as such:
|
|
  // http://www.fas.harvard.edu/~chgis/data/chgis/downloads/v4/howto/cyrillic.html
|
|
  WinAnsiUnicodeChars: packed array[128..159] of word = (
|
|
    8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338,
|
|
    141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482,
|
|
    353, 8250, 339, 157, 382, 376);
|
|
|
|
// initialize the fAnsiToWide[] and fWideToAnsi[] lookup tables for a
// fixed-width code page
// - raises ESynUnicode if aCodePage is not accepted by IsFixedWidthCodePage()
// or if the Operating System conversion returned an unexpected length
constructor TSynAnsiFixedWidth.Create(aCodePage: cardinal);
var
  c, written: PtrInt;
  A256: array[0..255] of AnsiChar;
  U256: array[0..255] of WideChar;
begin
  inherited;
  if not IsFixedWidthCodePage(aCodePage) then
    // warning: CreateUtf8() uses Utf8ToString() -> call CreateFmt() here
    raise ESynUnicode.CreateFmt('%s.Create - Invalid code page %d',
      [ClassNameShort(self)^, fCodePage]);
  // step 1: Ansi -> Unicode table, one word per possible byte value
  SetLength(fAnsiToWide, 256);
  if (aCodePage <> CP_WINANSI) and
     (aCodePage <> CP_LATIN1) and
     (aCodePage < CP_RAWBLOB) then
  begin
    // ask the Operating System to convert all 256 byte values at once
    for c := 0 to 255 do
      A256[c] := AnsiChar(c);
    FillcharFast(U256, SizeOf(U256), 0);
    // inherited method calls mormot.core.os cross-platform Unicode_AnsiToWide()
    written := PtrUInt(inherited AnsiBufferToUnicode(U256, A256, 256)) - PtrUInt(@U256);
    if (written < 500) or
       (written > 512) then
      // warning: CreateUtf8() uses Utf8ToString() -> call CreateFmt() now
      raise ESynUnicode.CreateFmt('OS error for %s.Create(%d) [%d]',
        [ClassNameShort(self)^, aCodePage, written]);
    MoveFast(U256[0], fAnsiToWide[0], 512);
  end
  else
  begin
    // LATIN1 and RawByteString map 8-bit values to the same Unicode value
    for c := 0 to 255 do
      fAnsiToWide[c] := c;
    // Win1252 has its own table: do not trust the Windows API for it :(
    if aCodePage = CP_WINANSI then
      for c := low(WinAnsiUnicodeChars) to high(WinAnsiUnicodeChars) do
        fAnsiToWide[c] := WinAnsiUnicodeChars[c];
  end;
  // step 2: Unicode -> Ansi reverse table, one byte per UTF-16 code unit
  SetLength(fWideToAnsi, 65536);
  for c := 1 to 126 do
    fWideToAnsi[c] := c;
  FillcharFast(fWideToAnsi[127], 65536 - 127, ord('?')); // '?' for unknown char
  for c := 127 to 255 do
    if (fAnsiToWide[c] <> 0) and
       (fAnsiToWide[c] <> ord('?')) then
      fWideToAnsi[fAnsiToWide[c]] := c;
  // fixed width Ansi will never be bigger than UTF-8
  fAnsiCharShift := 0;
end;
|
|
|
|
// check that up to Length WideChars (stopping at the first #0) all have an
// equivalent in this code page
// - returns true for nil or empty input
function TSynAnsiFixedWidth.IsValidAnsi(WideText: PWideChar; Length: PtrInt): boolean;
var
  ndx: PtrInt;
  u: PtrUInt;
begin
  result := false;
  if WideText <> nil then
    for ndx := 0 to Length - 1 do
    begin
      u := PtrUInt(WideText[ndx]);
      if u = 0 then
        break; // end of text
      if u < 256 then
      begin
        if fAnsiToWide[u] >= 256 then
          exit; // this byte value does not map to the same 8-bit Unicode char
      end
      else if fWideToAnsi[u] = ord('?') then
        exit; // no Ansi char for this code point
    end;
  result := true;
end;
|
|
|
|
// check that all WideChars of a #0 terminated text have an equivalent in
// this code page
// - returns true for nil or empty input
function TSynAnsiFixedWidth.IsValidAnsi(WideText: PWideChar): boolean;
var
  u: PtrUInt;
begin
  result := false;
  if WideText <> nil then
    repeat
      u := PtrUInt(WideText^);
      inc(WideText);
      if u = 0 then
        break; // end of text
      if u < 256 then
      begin
        if fAnsiToWide[u] >= 256 then
          exit; // this byte value does not map to the same 8-bit Unicode char
      end
      else if fWideToAnsi[u] = ord('?') then
        exit; // no Ansi char for this code point
    until false;
  result := true;
end;
|
|
|
|
// check that a #0 terminated UTF-8 text only contains chars which have an
// equivalent in this code page
// - returns false on malformed UTF-8 or on any code point above $ffff
function TSynAnsiFixedWidth.IsValidAnsiU(Utf8Text: PUtf8Char): boolean;
var
  extra: byte;
  c, n: cardinal;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := false;
  if Utf8Text <> nil then
    repeat
      extra := utf8.Lookup[ord(Utf8Text^)];
      inc(Utf8Text);
      if extra = UTF8_ASCII then
        continue; // plain 7-bit char
      if extra = UTF8_ZERO then
        break; // end of text reached
      if extra = UTF8_INVALID then
        exit;
      if utf8.Extra[extra].minimum > $ffff then
        exit; // such a sequence can not fit in a single WideChar
      // decode the "extra" continuation bytes following the lead byte
      c := ord(Utf8Text[-1]);
      for n := 1 to extra do
      begin
        if byte(Utf8Text^) and $c0 <> $80 then
          exit; // invalid UTF-8 content
        c := (c shl 6) + byte(Utf8Text^);
        inc(Utf8Text);
      end;
      dec(c, utf8.Extra[extra].offset);
      if (c > $ffff) or
         (fWideToAnsi[c] = ord('?')) then
        exit; // invalid char in the WinAnsi code page
    until false;
  result := true;
end;
|
|
|
|
// check that a #0 terminated UTF-8 text only contains code points <= 255
// whose fAnsiToWide[] entry is itself <= 255
// - returns false on malformed UTF-8
function TSynAnsiFixedWidth.IsValidAnsiU8Bit(Utf8Text: PUtf8Char): boolean;
var
  extra: byte;
  c, n: cardinal;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := false;
  if Utf8Text <> nil then
    repeat
      extra := utf8.Lookup[ord(Utf8Text^)];
      inc(Utf8Text);
      if extra = UTF8_ASCII then
        continue; // plain 7-bit char
      if extra = UTF8_ZERO then
        break; // end of text reached
      if extra = UTF8_INVALID then
        exit;
      if utf8.Extra[extra].minimum > $ffff then
        exit; // such a sequence can not fit in a single WideChar
      // decode the "extra" continuation bytes following the lead byte
      c := ord(Utf8Text[-1]);
      for n := 1 to extra do
      begin
        if byte(Utf8Text^) and $c0 <> $80 then
          exit; // invalid UTF-8 content
        c := (c shl 6) + byte(Utf8Text^);
        inc(Utf8Text);
      end;
      dec(c, utf8.Extra[extra].offset);
      if (c > 255) or
         (fAnsiToWide[c] > 255) then
        exit; // not 8-bit char (like "tm" or such) is marked invalid
    until false;
  result := true;
end;
|
|
|
|
// convert SourceChars UTF-16 code units into Ansi bytes using fWideToAnsi[]
// - returns the position just after the last byte written (Dest itself if
// Source is nil or SourceChars is 0); no #0 terminator is appended
function TSynAnsiFixedWidth.UnicodeBufferToAnsi(Dest: PAnsiChar;
  Source: PWideChar; SourceChars: cardinal): PAnsiChar;
var
  pair, n: cardinal;
  lookup: PAnsiChar;
begin
  if (Source <> nil) and
     (SourceChars <> 0) then
  begin
    // skip a leading UTF-16 LE BOM (do exist on Windows files)
    if Source^ = BOM_UTF16LE then
    begin
      inc(Source);
      dec(SourceChars);
    end;
    // first handle leading 7-bit ASCII chars, by pairs (Sha optimization)
    if SourceChars >= 2 then
      repeat
        pair := PCardinal(Source)^;
        if pair and $ff80ff80 <> 0 then
          break; // break on first non ASCII pair
        inc(Source, 2);
        dec(SourceChars, 2);
        pair := pair shr 8 or pair; // bring both low bytes into the low word
        PWord(Dest)^ := pair;
        inc(Dest, 2);
      until SourceChars < 2;
    // use internal lookup tables for fast process of remaining chars
    lookup := pointer(fWideToAnsi);
    for n := 1 to SourceChars shr 2 do
    begin
      Dest[0] := lookup[Ord(Source[0])];
      Dest[1] := lookup[Ord(Source[1])];
      Dest[2] := lookup[Ord(Source[2])];
      Dest[3] := lookup[Ord(Source[3])];
      inc(Dest, 4);
      inc(Source, 4);
    end;
    for n := 1 to SourceChars and 3 do
    begin
      Dest^ := lookup[Ord(Source^)];
      inc(Source);
      inc(Dest);
    end;
  end;
  result := Dest;
end;
|
|
|
|
// convert SourceChars bytes of UTF-8 into Ansi bytes using fWideToAnsi[]
// - returns the position just after the last byte written; no #0 terminator
// is appended
// - 7-bit ASCII content is copied four bytes at a time
// - conversion stops at the first invalid or truncated UTF-8 sequence
// - code points above $ffff are written as '?'
// - returns Dest immediately when Source is nil or SourceChars is 0
function TSynAnsiFixedWidth.Utf8BufferToAnsi(Dest: PAnsiChar;
  Source: PUtf8Char; SourceChars: cardinal): PAnsiChar;
var
  c: cardinal;
  endSource, endSourceBy4: PUtf8Char;
  i, extra: PtrInt;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  by1, by4, quit; // ugly but faster
begin
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  // empty input guard, as in the sibling conversion methods: with a nil
  // Source, "endSource - 4" below would wrap around, so the quad loop test
  // "Source <= endSourceBy4" would pass and dereference the nil pointer
  if (Source = nil) or
     (SourceChars = 0) then
  begin
    result := Dest;
    exit;
  end;
  // first handle leading 7-bit ASCII chars, by quad (Sha optimization)
  endSource := Source + SourceChars;
  endSourceBy4 := endSource - 4; // last position where 4 bytes can be read
  if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
     (Source <= endSourceBy4) then
    repeat
      c := PCardinal(Source)^;
      if c and $80808080 <> 0 then
        goto by1; // break on first non ASCII quad
by4:  PCardinal(Dest)^ := c;
      inc(Source, 4);
      inc(Dest, 4);
    until Source > endSourceBy4;
  // generic loop, handling one UTF-8 code per iteration
  if Source < endSource then
  begin
    repeat
by1:  c := byte(Source^);
      inc(Source);
      if ord(c) <= 127 then
      begin
        Dest^ := AnsiChar(c);
        inc(Dest);
        // try to switch back to the fast ASCII quad loop
        if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
           (Source <= endSourceBy4) then
        begin
          c := PCardinal(Source)^;
          if c and $80808080 = 0 then
            goto by4;
          continue;
        end;
        if Source < endSource then
          continue
        else
          break;
      end
      else
      begin
        extra := utf8.Lookup[c];
        if (extra = UTF8_INVALID) or
           (Source + extra > endSource) then
          break; // invalid lead byte, or sequence truncated by end of input
        i := extra;
        repeat
          if byte(Source^) and $c0 <> $80 then
            goto quit; // invalid UTF-8 content
          c := (c shl 6) + byte(Source^);
          inc(Source);
          dec(i);
        until i = 0;
        dec(c, utf8.Extra[extra].offset);
        if c > $ffff then
          Dest^ := '?' // '?' as in unknown fWideToAnsi[] items
        else
          Dest^ := AnsiChar(fWideToAnsi[c]);
        inc(Dest);
        if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
           (Source <= endSourceBy4) then
        begin
          c := PCardinal(Source)^;
          if c and $80808080 = 0 then
            goto by4;
          continue;
        end;
        if Source < endSource then
          continue
        else
          break;
      end;
    until false;
  end;
quit:
  result := Dest;
end;
|
|
|
|
// return the Ansi byte value matching a Unicode code point, or -1 if this
// code page has no equivalent for it
function TSynAnsiFixedWidth.WideCharToAnsiChar(wc: cardinal): integer;
begin
  result := -1; // pessimistic default
  if wc < 256 then
  begin
    // 8-bit value: valid only if it maps to an 8-bit Unicode char
    if fAnsiToWide[wc] < 256 then
      result := wc;
  end
  else if wc <= 65535 then
    // reverse lookup: '?' marks a code point with no Ansi equivalent
    if fWideToAnsi[wc] <> ord('?') then
      result := fWideToAnsi[wc];
end;
|
|
|
|
|
|
{ TSynAnsiUtf8 }
|
|
|
|
// here the "Ansi" content is UTF-8: decode it into UTF-16
// - Utf8ToWideChar() result is halved (shr 1) to advance Dest in WideChars
function TSynAnsiUtf8.AnsiBufferToUnicode(Dest: PWideChar;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar;
begin
  result := Dest +
    (Utf8ToWideChar(Dest, PUtf8Char(Source), SourceChars, NoTrailingZero) shr 1);
end;
|
|
|
|
// UTF-8 to UTF-8: plain copy of SourceChars bytes
// - a #0 terminator is stored (but not counted) unless NoTrailingZero is set
function TSynAnsiUtf8.AnsiBufferToUtf8(Dest: PUtf8Char;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char;
begin
  MoveFast(Source^, Dest^, SourceChars);
  result := Dest + SourceChars;
  if not NoTrailingZero then
    Dest[SourceChars] := #0;
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
// the source buffer is UTF-8: delegate to Utf8DecodeToRawUniCode()
function TSynAnsiUtf8.AnsiToRawUnicode(Source: PAnsiChar;
  SourceChars: cardinal): RawUnicode;
begin
  result := Utf8DecodeToRawUniCode(PUtf8Char(Source), SourceChars);
end;
|
|
{$endif PUREMORMOT2}
|
|
|
|
// this class only supports CP_UTF8: any other code page raises ESynUnicode
constructor TSynAnsiUtf8.Create(aCodePage: cardinal);
begin
  if aCodePage <> CP_UTF8 then
    raise ESynUnicode.CreateFmt('%s.Create(%d)',
      [ClassNameShort(self)^, aCodePage]);
  inherited Create(aCodePage);
end;
|
|
|
|
// encode UTF-16 into UTF-8 directly into Dest, with no trailing #0
// - SourceChars * 3 is passed to RawUnicodeToUtf8() as the destination size
function TSynAnsiUtf8.UnicodeBufferToAnsi(Dest: PAnsiChar;
  Source: PWideChar; SourceChars: cardinal): PAnsiChar;
begin
  result := Dest + RawUnicodeToUtf8(
    PUtf8Char(Dest), SourceChars * 3, Source, SourceChars, [ccfNoTrailingZero]);
end;
|
|
|
|
// encode UTF-16 into a new UTF-8 string tagged with fCodePage
// - returns '' if Source is nil or SourceChars is 0
function TSynAnsiUtf8.UnicodeBufferToAnsi(Source: PWideChar;
  SourceChars: cardinal): RawByteString;
var
  tmp: TSynTempBuffer;
begin
  if (Source <> nil) and
     (SourceChars <> 0) then
  begin
    // encode into a temporary buffer, then copy the exact encoded length
    tmp.Init(SourceChars * 3);
    FastSetStringCP(result, tmp.buf,
      RawUnicodeToUtf8(tmp.buf, SourceChars * 3, Source, SourceChars,
        [ccfNoTrailingZero]),
      fCodePage);
    tmp.Done;
  end
  else
    result := '';
end;
|
|
|
|
// UTF-8 to UTF-8: plain copy of SourceChars bytes, with no trailing #0
function TSynAnsiUtf8.Utf8BufferToAnsi(Dest: PAnsiChar;
  Source: PUtf8Char; SourceChars: cardinal): PAnsiChar;
begin
  result := Dest + SourceChars;
  MoveFast(Source^, Dest^, SourceChars);
end;
|
|
|
|
// UTF-8 to UTF-8: store a copy of the buffer into result, typed as RawUtf8
procedure TSynAnsiUtf8.Utf8BufferToAnsi(Source: PUtf8Char; SourceChars: cardinal;
  var result: RawByteString);
begin
  FastSetString(RawUtf8(result), Source, SourceChars);
end;
|
|
|
|
// UTF-8 to UTF-8: regular string assignment followed by EnsureRawUtf8()
function TSynAnsiUtf8.Utf8ToAnsi(const u: RawUtf8): RawByteString;
begin
  // u may be read-only: do not use FastAssignUtf8/FakeCodePage on it
  result := u;
  EnsureRawUtf8(result);
end;
|
|
|
|
// UTF-8 to UTF-8: regular string assignment followed by EnsureRawUtf8()
function TSynAnsiUtf8.AnsiToUtf8(const AnsiText: RawByteString): RawUtf8;
begin
  // AnsiText may be read-only: do not use FastAssignUtf8/FakeCodePage on it
  result := AnsiText;
  EnsureRawUtf8(result);
end;
|
|
|
|
// UTF-8 to UTF-8: store a copy of the buffer into Value
procedure TSynAnsiUtf8.AnsiBufferToRawUtf8(
  Source: PAnsiChar; SourceChars: cardinal; out Value: RawUtf8);
begin
  FastSetString(Value, Source, SourceChars);
end;
|
|
|
|
|
|
{ TSynAnsiUtf16 }
|
|
|
|
// UTF-16 to UTF-16: plain copy, where SourceChars is used as a byte count
// - a WideChar #0 is stored after the copied bytes unless NoTrailingZero is set
function TSynAnsiUtf16.AnsiBufferToUnicode(Dest: PWideChar;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PWideChar;
begin
  // advance by bytes, hence the PtrUInt() arithmetic
  result := Pointer(PtrUInt(Dest) + SourceChars);
  MoveFast(Source^, Dest^, SourceChars);
  if not NoTrailingZero then
    result^ := #0;
end;
|
|
|
|
const
|
|
  // map a NoTrailingZero boolean into the matching TCharConversionFlags set
  NOTRAILING: array[boolean] of TCharConversionFlags = (
|
|
    [], [ccfNoTrailingZero]);
|
|
|
|
// the "Ansi" content is UTF-16 here: encode it as UTF-8
// - SourceChars comes in as a byte count
function TSynAnsiUtf16.AnsiBufferToUtf8(Dest: PUtf8Char;
  Source: PAnsiChar; SourceChars: cardinal; NoTrailingZero: boolean): PUtf8Char;
begin
  SourceChars := SourceChars shr 1; // from byte count to WideChar count
  result := Dest + RawUnicodeToUtf8(
    Dest, SourceChars * 3, PWideChar(Source), SourceChars,
    NOTRAILING[NoTrailingZero]);
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
// UTF-16 to UTF-16: copy the SourceChars bytes into the returned string
function TSynAnsiUtf16.AnsiToRawUnicode(Source: PAnsiChar;
  SourceChars: cardinal): RawUnicode;
begin
  // SourceChars is a byte count here
  SetString(result, Source, SourceChars);
end;
|
|
{$endif PUREMORMOT2}
|
|
|
|
// this class only supports CP_UTF16: any other code page raises ESynUnicode
constructor TSynAnsiUtf16.Create(aCodePage: cardinal);
begin
  if aCodePage <> CP_UTF16 then
    raise ESynUnicode.CreateFmt('%s.Create(%d)',
      [ClassNameShort(self)^, aCodePage]);
  inherited Create(aCodePage);
end;
|
|
|
|
// UTF-16 to UTF-16: plain copy of SourceChars WideChars, with no trailing #0
function TSynAnsiUtf16.UnicodeBufferToAnsi(Dest: PAnsiChar;
  Source: PWideChar; SourceChars: cardinal): PAnsiChar;
begin
  SourceChars := SourceChars shl 1; // from WideChar count to byte count
  result := Dest + SourceChars;
  MoveFast(Source^, Dest^, SourceChars);
end;
|
|
|
|
// decode UTF-8 into UTF-16 directly into Dest, with no trailing #0
// - Dest is advanced by the value returned by Utf8ToWideChar()
function TSynAnsiUtf16.Utf8BufferToAnsi(Dest: PAnsiChar;
  Source: PUtf8Char; SourceChars: cardinal): PAnsiChar;
begin
  result := Dest +
    Utf8ToWideChar(PWideChar(Dest), Source, SourceChars, true);
end;
|
|
|
|
|
|
// true for code pages 1250..1258, CP_LATIN1, and any value >= CP_RAWBLOB
function IsFixedWidthCodePage(aCodePage: cardinal): boolean;
begin
  result := (aCodePage = CP_LATIN1) or
            (aCodePage >= CP_RAWBLOB) or
            ((aCodePage >= 1250) and
             (aCodePage <= 1258));
end;
|
|
|
|
// return a short text for a code page: 'utf8', 'WinAnsi', or 'cp' followed
// by the code page number
function CodePageToText(aCodePage: cardinal): TShort16;
begin
  if aCodePage = CP_UTF8 then
    result := 'utf8'
  else if aCodePage = CODEPAGE_US then
    result := 'WinAnsi'
  else
  begin
    // store length byte 2 + 'c' + 'p' in one write, then append the number
    PCardinal(@result)^ := 2 + ord('c') shl 8 + ord('p') shl 16;
    AppendShortCardinal(aCodePage, result);
  end;
end;
|
|
|
|
|
|
{ *************** Text File Loading with BOM/Unicode Support }
|
|
|
|
// detect a UTF-16 LE or UTF-8 Byte Order Mark at the start of a buffer
// - when a BOM is found, Buffer is advanced and BufferSize reduced to skip it
// - returns bomNone (leaving Buffer/BufferSize untouched) otherwise
function BomFile(var Buffer: pointer; var BufferSize: PtrInt): TBomFile;
begin
  result := bomNone;
  if (Buffer = nil) or
     (BufferSize < 2) then
    exit;
  if PWord(Buffer)^ = ord(BOM_UTF16LE) then
  begin
    result := bomUnicode; // UTF-16 LE
    inc(PByte(Buffer), 2);
    dec(BufferSize, 2);
  end
  else if PWord(Buffer)^ = $BBEF then
    // first two bytes of the UTF-8 BOM: the third one must be $BF
    if (BufferSize >= 3) and
       (PByteArray(Buffer)[2] = $BF) then
    begin
      result := bomUtf8;
      inc(PByte(Buffer), 3);
      dec(BufferSize, 3);
    end;
end;
|
|
|
|
// load a file into FileContent, then let BomFile() detect and skip any BOM
// - Buffer/BufferSize point into FileContent, after the BOM if any
function StringFromBomFile(const FileName: TFileName; out FileContent: RawByteString;
  out Buffer: pointer; out BufferSize: PtrInt): TBomFile;
begin
  FileContent := StringFromFile(FileName);
  BufferSize := length(FileContent);
  Buffer := pointer(FileContent);
  result := BomFile(Buffer, BufferSize);
end;
|
|
|
|
// load a text file as UTF-8, assuming UTF-8 content when no BOM is found
function RawUtf8FromFile(const FileName: TFileName): RawUtf8;
begin
  result := AnyTextFileToRawUtf8(FileName, {AssumeUtf8IfNoBom=}true);
end;
|
|
|
|
// load a text file as UTF-8, honouring any UTF-16 LE or UTF-8 BOM
// - with no BOM, the content is either taken as UTF-8 (AssumeUtf8IfNoBom)
// or converted via CurrentAnsiConvert
function AnyTextFileToRawUtf8(const FileName: TFileName; AssumeUtf8IfNoBom: boolean): RawUtf8;
var
  content: RawByteString;
  p: pointer;
  n: PtrInt;
begin
  case StringFromBomFile(FileName, content, p, n) of
    bomNone:
      if AssumeUtf8IfNoBom then
        FastAssignUtf8(result, content)
      else
        CurrentAnsiConvert.AnsiBufferToRawUtf8(p, n, result);
    bomUnicode:
      RawUnicodeToUtf8(PWideChar(p), n shr 1, result);
    bomUtf8:
      if n <> 0 then
      begin
        // fast in-place delete(bom): shift the text over the BOM bytes
        MoveFast(p^, pointer(content)^, n);
        FakeLength(content, n);
        FastAssignUtf8(result, content)
      end
      else
        result := '';
  end;
end;
|
|
|
|
// load a text file as SynUnicode
// - ForceUtf8 decodes the whole file as UTF-8 with no BOM detection
// - otherwise the BOM selects the decoding, with CurrentAnsiConvert as default
function AnyTextFileToSynUnicode(const FileName: TFileName; ForceUtf8: boolean): SynUnicode;
var
  content: RawByteString;
  p: pointer;
  n: PtrInt;
begin
  if ForceUtf8 then
    Utf8ToSynUnicode(StringFromFile(FileName), result)
  else
    case StringFromBomFile(FileName, content, p, n) of
      bomNone:
        result := CurrentAnsiConvert.AnsiToUnicodeString(p, n);
      bomUnicode:
        SetString(result, PWideChar(p), n shr 1); // bytes to WideChar count
      bomUtf8:
        Utf8ToSynUnicode(p, n, result);
    end;
end;
|
|
|
|
// load a text file as a RTL string
// - ForceUtf8 decodes the whole file as UTF-8 with no BOM detection
// - otherwise the BOM selects the decoding
function AnyTextFileToString(const FileName: TFileName; ForceUtf8: boolean): string;
var
  content: RawByteString;
  p: pointer;
  n: PtrInt;
begin
  {$ifdef UNICODE}
  // string = UnicodeString
  if ForceUtf8 then
    Utf8ToStringVar(StringFromFile(FileName), result)
  else
    case StringFromBomFile(FileName, content, p, n) of
      bomNone:
        result := CurrentAnsiConvert.AnsiToUnicodeString(p, n);
      bomUnicode:
        SetString(result, PWideChar(p), n shr 1);
      bomUtf8:
        Utf8DecodeToString(p, n, result);
    end;
  {$else}
  // string = AnsiString
  if ForceUtf8 then
    result := CurrentAnsiConvert.Utf8ToAnsi(StringFromFile(FileName))
  else
    case StringFromBomFile(FileName, content, p, n) of
      bomNone:
        SetString(result, PAnsiChar(p), n);
      bomUnicode:
        result := CurrentAnsiConvert.UnicodeBufferToAnsi(p, n shr 1);
      bomUtf8:
        result := CurrentAnsiConvert.Utf8BufferToAnsi(p, n);
    end;
  {$endif UNICODE}
end;
|
|
|
|
|
|
{ *************** Low-Level String Conversion Functions }
|
|
|
|
// convert a string of any code page into UTF-8
// - with HASCODEPAGE, the code page stored within s selects the conversion
// - otherwise CurrentAnsiConvert is used
procedure AnyAnsiToUtf8(const s: RawByteString; var result: RawUtf8);
{$ifdef HASCODEPAGE}
var
  page: cardinal;
{$endif HASCODEPAGE}
begin
  if s = '' then
  begin
    result := '';
    exit;
  end;
  {$ifdef HASCODEPAGE}
  page := GetCodePage(s);
  if page = CP_ACP then
  begin
    page := Unicode_CodePage;
    {$ifdef FPC}
    if page = CP_UTF8 then // happens on POSIX and with Lazarus - so FPC only
    begin
      if PStrRec(PAnsiChar(pointer(s)) - _STRRECSIZE)^.refCnt < 0 then
        FastSetString(result, pointer(s), length(s)) // realloc constant
      else
      begin
        result := s; // not a read-only constant: assign by ref
        FakeCodePage(RawByteString(result), page); // override 0 by CP_UTF8
      end;
      exit;
    end;
    {$endif FPC}
  end;
  if page = CP_UTF8 then
    result := s
  else if page >= CP_RAWBLOB then
    FastSetString(result, pointer(s), length(s)) // no convert, just copy
  else
    TSynAnsiConvert.Engine(page).AnsiBufferToRawUtf8(pointer(s), length(s), result);
  {$else}
  CurrentAnsiConvert.AnsiBufferToRawUtf8(pointer(s), length(s), result);
  {$endif HASCODEPAGE}
end;
|
|
|
|
// function wrapper around the AnyAnsiToUtf8() procedure overload
function AnyAnsiToUtf8(const s: RawByteString): RawUtf8;
begin
  AnyAnsiToUtf8(s, result);
end;
|
|
|
|
// convert a WinAnsi buffer into UTF-8 through WinAnsiConvert
function WinAnsiBufferToUtf8(Dest: PUtf8Char;
  Source: PAnsiChar; SourceChars: cardinal): PUtf8Char;
begin
  result := WinAnsiConvert.AnsiBufferToUtf8(Dest, Source, SourceChars);
end;
|
|
|
|
// convert a ShortString into UTF-8 through WinAnsiConvert
function ShortStringToUtf8(const source: ShortString): RawUtf8;
begin
  // source[0] is the length byte, source[1] the first char
  WinAnsiConvert.AnsiBufferToRawUtf8(@source[1], ord(source[0]), result);
end;
|
|
|
|
// convert a WinAnsi string into a #0 terminated UTF-16 buffer
// - Dest is expected to hold DestLen WideChars, trailing #0 included: the
// text is truncated to DestLen - 1 chars if needed
// - does nothing if Dest is nil or DestLen <= 0, since not even the
// terminator could be stored
procedure WinAnsiToUnicodeBuffer(const S: WinAnsiString; Dest: PWordArray; DestLen: PtrInt);
var
  L: PtrInt;
begin
  // without this guard, DestLen = 0 would compute L = -1 below, which would
  // then be passed as a huge cardinal count to AnsiBufferToUnicode()
  if (Dest = nil) or
     (DestLen <= 0) then
    exit;
  L := length(S);
  if L >= DestLen then
    L := DestLen - 1; // truncate to avoid buffer overflow
  if L <> 0 then
    // including last #0
    WinAnsiConvert.AnsiBufferToUnicode(PWideChar(Dest), pointer(S), L)
  else
    Dest^[0] := 0;
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
// convert a WinAnsi string into RawUnicode through WinAnsiConvert
function WinAnsiToRawUnicode(const S: WinAnsiString): RawUnicode;
begin
  result := WinAnsiConvert.AnsiToRawUnicode(S);
end;
|
|
{$endif PUREMORMOT2}
|
|
|
|
// convert a WinAnsi string into UTF-8 through WinAnsiConvert
function WinAnsiToUtf8(const S: WinAnsiString): RawUtf8;
begin
  WinAnsiConvert.AnsiBufferToRawUtf8(pointer(S), length(S), result);
end;
|
|
|
|
// convert a WinAnsi buffer of WinAnsiLen bytes into UTF-8
function WinAnsiToUtf8(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): RawUtf8;
begin
  WinAnsiConvert.AnsiBufferToRawUtf8(WinAnsi, WinAnsiLen, result);
end;
|
|
|
|
// return the WinAnsi char matching a Unicode code point, or '?' if
// WinAnsiConvert.WideCharToAnsiChar() reports -1
function WideCharToWinAnsiChar(wc: cardinal): AnsiChar;
var
  ansi: integer;
begin
  ansi := WinAnsiConvert.WideCharToAnsiChar(wc);
  if ansi = -1 then
    result := '?'
  else
    result := AnsiChar(ansi);
end;
|
|
|
|
// return WinAnsiConvert.WideCharToAnsiChar() for a Unicode code point
function WideCharToWinAnsi(wc: cardinal): integer;
begin
  result := WinAnsiConvert.WideCharToAnsiChar(wc);
end;
|
|
|
|
// check via WinAnsiConvert that up to Length WideChars are valid WinAnsi
function IsWinAnsi(WideText: PWideChar; Length: integer): boolean;
begin
  result := WinAnsiConvert.IsValidAnsi(WideText, Length);
end;
|
|
|
|
// check via WinAnsiConvert that a #0 terminated UTF-16 text is valid WinAnsi
function IsWinAnsi(WideText: PWideChar): boolean;
begin
  result := WinAnsiConvert.IsValidAnsi(WideText);
end;
|
|
|
|
// check via WinAnsiConvert that a #0 terminated UTF-8 text is valid WinAnsi
function IsWinAnsiU(Utf8Text: PUtf8Char): boolean;
begin
  result := WinAnsiConvert.IsValidAnsiU(Utf8Text);
end;
|
|
|
|
// check via WinAnsiConvert.IsValidAnsiU8Bit() that a #0 terminated UTF-8
// text only holds 8-bit WinAnsi chars
function IsWinAnsiU8Bit(Utf8Text: PUtf8Char): boolean;
begin
  result := WinAnsiConvert.IsValidAnsiU8Bit(Utf8Text);
end;
|
|
|
|
// convert count bytes of UTF-8 into WinAnsi, written into dest
// - returns the number of bytes written
function Utf8ToWinPChar(dest: PAnsiChar; source: PUtf8Char; count: integer): integer;
begin
  // the converter returns the end position: subtract the start
  result := WinAnsiConvert.Utf8BufferToAnsi(dest, source, count) - dest;
end;
|
|
|
|
// convert a UTF-8 string into WinAnsi through WinAnsiConvert
function Utf8ToWinAnsi(const S: RawUtf8): WinAnsiString;
begin
  result := WinAnsiConvert.Utf8ToAnsi(S);
end;
|
|
|
|
// convert a UTF-8 PUtf8Char text into WinAnsi through WinAnsiConvert
function Utf8ToWinAnsi(P: PUtf8Char): WinAnsiString;
begin
  result := WinAnsiConvert.Utf8ToAnsi(P);
end;
|
|
|
|
// copy a #0 terminated UTF-8 buffer into a RawUtf8 string
procedure Utf8ToRawUtf8(P: PUtf8Char; var result: RawUtf8);
begin
  // fast and Delphi 2009+ ready
  FastSetString(result, P, StrLen(P));
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
|
|
// decode a UTF-8 buffer into a RawUnicode string
// - if L is 0, the length is computed with StrLen(P)
// - returns '' for empty input
function Utf8DecodeToRawUnicode(P: PUtf8Char; L: integer): RawUnicode;
var
  tmp: TSynTempBuffer;
begin
  result := ''; // somewhat faster if result is freed before any SetLength()
  if L = 0 then
  begin
    L := StrLen(P);
    if L = 0 then
      exit;
  end;
  // decode into a temporary buffer of L * 3 bytes
  tmp.Init(L * 3);
  // + 1 byte so that the string ends with a true WideChar(#0)
  SetString(result, PAnsiChar(tmp.buf), Utf8ToWideChar(tmp.buf, P, L) + 1);
  tmp.Done;
end;
|
|
|
|
// decode a RawUtf8 string into a RawUnicode string
function Utf8DecodeToRawUnicode(const S: RawUtf8): RawUnicode;
begin
  if S <> '' then
    result := Utf8DecodeToRawUnicode(pointer(S), Length(S))
  else
    result := '';
end;
|
|
|
|
// decode a RawUtf8 string into a RawUnicode string
// - if DestLen is not nil, it receives the value returned by the
// Utf8DecodeToRawUnicodeUI(S, Dest) overload
function Utf8DecodeToRawUnicodeUI(const S: RawUtf8; DestLen: PInteger): RawUnicode;
var
  decoded: integer;
begin
  decoded := Utf8DecodeToRawUnicodeUI(S, result);
  if DestLen <> nil then
    DestLen^ := decoded;
end;
|
|
|
|
// decode a RawUtf8 string into Dest
// - Dest is allocated as Length(S) * 2 + 2 bytes and is not shrunk afterwards
// - returns the value of Utf8ToWideChar(), or 0 when S is empty
function Utf8DecodeToRawUnicodeUI(const S: RawUtf8; var Dest: RawUnicode): integer;
begin
  Dest := ''; // somewhat faster if Dest is freed before any SetLength()
  result := Length(S);
  if result = 0 then
    exit;
  SetLength(Dest, result * 2 + 2);
  result := Utf8ToWideChar(pointer(Dest), Pointer(S), result);
end;
|
|
|
|
// encode a RawUnicode string as UTF-8
function RawUnicodeToUtf8(const Unicode: RawUnicode): RawUtf8;
begin
  // Length() is in bytes: shr 1 gives the WideChar count
  RawUnicodeToUtf8(pointer(Unicode), Length(Unicode) shr 1, result);
end;
|
|
|
|
// copy a RawUnicode string into a SynUnicode string
function RawUnicodeToSynUnicode(const Unicode: RawUnicode): SynUnicode;
begin
  // Length() is in bytes: shr 1 gives the WideChar count
  SetString(result, PWideChar(pointer(Unicode)), Length(Unicode) shr 1);
end;
|
|
|
|
// convert a RawUnicode string into WinAnsi through WinAnsiConvert
function RawUnicodeToWinAnsi(const Unicode: RawUnicode): WinAnsiString;
begin
  // Length() is in bytes: shr 1 gives the WideChar count
  result := WinAnsiConvert.UnicodeBufferToAnsi(
    pointer(Unicode), Length(Unicode) shr 1);
end;
|
|
|
|
{$endif PUREMORMOT2}
|
|
|
|
// encode a SynUnicode string as UTF-8
function SynUnicodeToUtf8(const Unicode: SynUnicode): RawUtf8;
begin
  RawUnicodeToUtf8(pointer(Unicode), Length(Unicode), result);
end;
|
|
|
|
// copy WideCharCount UTF-16 code units into a new SynUnicode string
function RawUnicodeToSynUnicode(WideChar: PWideChar; WideCharCount: integer): SynUnicode;
begin
  SetString(result, WideChar, WideCharCount);
end;
|
|
|
|
// convert WideCharCount UTF-16 code units into WinAnsi, written into dest
// - the end position returned by the converter is discarded
procedure RawUnicodeToWinPChar(dest: PAnsiChar; source: PWideChar; WideCharCount: integer);
begin
  WinAnsiConvert.UnicodeBufferToAnsi(dest, source, WideCharCount);
end;
|
|
|
|
// convert WideCharCount UTF-16 code units into a new WinAnsi string
function RawUnicodeToWinAnsi(WideChar: PWideChar; WideCharCount: integer): WinAnsiString;
begin
  result := WinAnsiConvert.UnicodeBufferToAnsi(WideChar, WideCharCount);
end;
|
|
|
|
// convert a WideString into a new WinAnsi string
function WideStringToWinAnsi(const Wide: WideString): WinAnsiString;
begin
  result := WinAnsiConvert.UnicodeBufferToAnsi(pointer(Wide), Length(Wide));
end;
|
|
|
|
// convert a #0 terminated UTF-16 buffer into a WinAnsi string
// - Dest is sized to one byte per source WideChar
procedure UnicodeBufferToWinAnsi(source: PWideChar; out Dest: WinAnsiString);
var
  chars: integer;
begin
  chars := StrLenW(source);
  SetLength(Dest, chars);
  WinAnsiConvert.UnicodeBufferToAnsi(pointer(Dest), source, chars);
end;
|
|
|
|
// convert a #0 terminated UTF-16 buffer into a RTL string
function UnicodeBufferToString(source: PWideChar): string;
begin
  result := RawUnicodeToString(source, StrLenW(source));
end;
|
|
|
|
// encode a #0 terminated UTF-16 buffer as UTF-8
function UnicodeBufferToUtf8(source: PWideChar): RawUtf8;
begin
  RawUnicodeToUtf8(source, StrLenW(source), result);
end;
|
|
|
|
// store a #0 terminated UTF-16 buffer into a variant, encoded as UTF-8
// - a nil source leaves the variant as set by ClearVariantForString()
function UnicodeBufferToVariant(source: PWideChar): variant;
begin
  ClearVariantForString(result);
  if Source = nil then
    exit;
  // write the UTF-8 text directly into the variant VAny field
  RawUnicodeToUtf8(source, StrLenW(source), RawUtf8(TVarData(result).VAny));
end;
|
|
|
|
// function wrapper around the StringToVariant() procedure overload
function StringToVariant(const Txt: string): variant;
begin
  StringToVariant(Txt, result);
end;
|
|
|
|
// store a RTL string into a variant, as UTF-8 text
// - an empty Txt leaves the variant as set by ClearVariantForString()
procedure StringToVariant(const Txt: string; var result: variant);
begin
  ClearVariantForString(result);
  if Txt = '' then
    exit;
  {$ifndef UNICODE}
  // AnsiString: no conversion needed if the system code page is UTF-8 or
  // IsAnsiCompatible() accepts the text
  if (Unicode_CodePage = CP_UTF8) or
     IsAnsiCompatible(Txt) then
  begin
    RawByteString(TVarData(result).VAny) := Txt;
    EnsureRawUtf8(RawByteString(TVarData(result).VAny));
  end
  else
  {$endif UNICODE}
    StringToUtf8(Txt, RawUtf8(TVarData(result).VAny));
end;
|
|
|
|
// convert an Ansi buffer of the given code page into UTF-8
procedure AnsiCharToUtf8(P: PAnsiChar; L: integer; var result: RawUtf8;
  CodePage: integer);
begin
  // TSynAnsiConvert.Engine() supplies the converter for this code page
  TSynAnsiConvert.Engine(CodePage).AnsiBufferToRawUtf8(P, L, result);
end;
|
|
|
|
// convert an Ansi string of the given code page into UTF-8
function AnsiToUtf8(const Ansi: RawByteString; CodePage: integer): RawUtf8;
begin
  if Ansi <> '' then
    result := TSynAnsiConvert.Engine(CodePage).AnsiToUtf8(Ansi)
  else
    result := '';
end;
|
|
|
|
// convert an Ansi string of the given code page into a RTL string
function AnsiToString(const Ansi: RawByteString; CodePage: integer): string;
begin
  if Ansi <> '' then
    {$ifdef UNICODE}
    result := TSynAnsiConvert.Engine(CodePage).AnsiToUnicodeString(Ansi)
    {$else}
    result := CurrentAnsiConvert.AnsiToAnsi(TSynAnsiConvert.Engine(CodePage), Ansi)
    {$endif UNICODE}
  else
    result := '';
end;
|
|
|
|
// return a UTF-8 view of an Ansi buffer, converting only when required
// - if no conversion is needed, Buf itself is returned and Temp.Buf is nil
// - otherwise the text is converted into Temp, and Temp.len receives the
// UTF-8 length
function AnsiBufferToTempUtf8(var Temp: TSynTempBuffer; Buf: PAnsiChar; BufLen,
  CodePage: cardinal): PUtf8Char;
begin
  if (BufLen = 0) or
     (CodePage = CP_UTF8) or
     (CodePage >= CP_RAWBLOB) or
     IsAnsiCompatible(Buf, BufLen) then
  begin
    // input can be used as-is
    Temp.Buf := nil;
    Temp.len := BufLen;
    result := PUtf8Char(Buf);
    exit;
  end;
  // convert into a temporary buffer of BufLen * 3 bytes
  Temp.Init(BufLen * 3);
  Buf := pointer(TSynAnsiConvert.Engine(CodePage).
    AnsiBufferToUtf8(Temp.Buf, Buf, BufLen));
  Temp.len := Buf - PAnsiChar(Temp.Buf); // Buf now points at the UTF-8 end
  result := Temp.Buf;
end;
|
|
|
|
{$ifdef UNICODE}
|
|
|
|
// widen each byte of Text into one WideChar of the result
function Ansi7ToString(const Text: RawByteString): string;
var
  n, count: PtrInt;
begin
  count := Length(Text);
  SetString(result, nil, count);
  for n := 0 to count - 1 do
    PWordArray(result)[n] := PByteArray(Text)[n]; // no conversion for 7-bit Ansi
end;
|
|
|
|
// function wrapper around the Ansi7ToString() procedure overload
function Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt): string;
begin
  Ansi7ToString(Text, Len, result);
end;
|
|
|
|
// widen each of the Len bytes of Text into one WideChar of result
procedure Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt; var result: string);
var
  n: PtrInt;
begin
  SetString(result, nil, Len);
  for n := 0 to Len - 1 do
    PWordArray(result)[n] := PByteArray(Text)[n]; // no conversion for 7-bit Ansi
end;
|
|
|
|
// narrow each WideChar of Text into one byte of the result
function StringToAnsi7(const Text: string): RawByteString;
var
  n, count: PtrInt;
begin
  count := Length(Text);
  SetString(result, nil, count);
  for n := 0 to count - 1 do
    PByteArray(result)[n] := PWordArray(Text)[n]; // no conversion for 7-bit Ansi
end;
|
|
|
|
// convert a RTL (UTF-16) string into a WinAnsi string
function StringToWinAnsi(const Text: string): WinAnsiString;
begin
  result := RawUnicodeToWinAnsi(Pointer(Text), Length(Text));
end;
|
|
|
|
// encode SourceChars RTL chars as UTF-8 into Dest
// - SourceChars * 3 is passed to RawUnicodeToUtf8() as the destination size
// - returns the position just after the encoded text
function StringBufferToUtf8(Dest: PUtf8Char; Source: PChar; SourceChars: PtrInt): PUtf8Char;
begin
  result := Dest +
    RawUnicodeToUtf8(Dest, SourceChars * 3, PWideChar(Source), SourceChars, []);
end;
|
|
|
|
// encode a #0 terminated RTL text buffer as UTF-8
procedure StringBufferToUtf8(Source: PChar; out result: RawUtf8);
begin
  RawUnicodeToUtf8(Source, StrLenW(Source), result);
end;
|
|
|
|
// encode a RTL (UTF-16) string as UTF-8
function StringToUtf8(const Text: string): RawUtf8;
begin
  RawUnicodeToUtf8(pointer(Text), Length(Text), result);
end;
|
|
|
|
// encode TextLen RTL chars as UTF-8 into result
procedure StringToUtf8(Text: PChar; TextLen: PtrInt; var result: RawUtf8);
begin
  RawUnicodeToUtf8(Text, TextLen, result);
end;
|
|
|
|
// encode a RTL (UTF-16) string as UTF-8 into result
procedure StringToUtf8(const Text: string; var result: RawUtf8);
begin
  RawUnicodeToUtf8(pointer(Text), Length(Text), result);
end;
|
|
|
|
function StringToUtf8(const Text: string; var Temp: TSynTempBuffer): integer;
var
  chars: integer;
begin
  // reserve 3 bytes per source char in Temp, then encode into it and
  // return the value given by RawUnicodeToUtf8()
  chars := length(Text);
  Temp.Init(chars * 3);
  result := RawUnicodeToUtf8(Temp.buf, Temp.len + 1, pointer(Text), chars, []);
end;
|
|
|
|
function ToUtf8(const Text: string): RawUtf8;
begin
  // same processing as StringToUtf8(Text) under a shorter name
  RawUnicodeToUtf8(pointer(Text), Length(Text), result);
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
|
|
function StringToRawUnicode(const S: string): RawUnicode;
begin
  // raw copy of the UTF-16 content: 2 bytes per char, +1 for last wide #0
  SetString(result, PAnsiChar(pointer(S)), length(S) * 2 + 1);
end;
|
|
|
|
function StringToRawUnicode(P: PChar; L: integer): RawUnicode;
begin
  // raw copy of L UTF-16 chars: 2 bytes per char, +1 for last wide #0
  SetString(result, PAnsiChar(P), L * 2 + 1);
end;
|
|
|
|
function RawUnicodeToString(const U: RawUnicode): string;
begin
  // the char count comes from StrLenW() and not from length(U), to handle
  // the case when U was used as a buffer
  SetString(result, PWideChar(pointer(U)), StrLenW(Pointer(U)));
end;
|
|
|
|
{$endif PUREMORMOT2}
|
|
|
|
function StringToSynUnicode(const S: string): SynUnicode;
begin
  // plain assignment in this UNICODE branch
  result := S;
end;
|
|
|
|
procedure StringToSynUnicode(const S: string; var result: SynUnicode);
begin
  // plain assignment in this UNICODE branch
  result := S;
end;
|
|
|
|
function RawUnicodeToString(P: PWideChar; L: integer): string;
begin
  // copy L UTF-16 chars from P into a new string
  SetString(result, P, L);
end;
|
|
|
|
procedure RawUnicodeToString(P: PWideChar; L: integer; var result: string);
begin
  // copy L UTF-16 chars from P into the supplied string variable
  SetString(result, P, L);
end;
|
|
|
|
function SynUnicodeToString(const U: SynUnicode): string;
begin
  // plain assignment in this UNICODE branch
  result := U;
end;
|
|
|
|
function Utf8DecodeToString(P: PUtf8Char; L: integer): string;
begin
  // string is UnicodeString here: reuse the UnicodeString decoder
  Utf8DecodeToUnicodeString(P, L, result);
end;
|
|
|
|
procedure Utf8DecodeToString(P: PUtf8Char; L: integer; var result: string);
begin
  // string is UnicodeString here: reuse the UnicodeString decoder
  Utf8DecodeToUnicodeString(P, L, result);
end;
|
|
|
|
function Utf8ToString(const Text: RawUtf8): string;
begin
  // decode the whole UTF-8 content into the UTF-16 result
  Utf8DecodeToUnicodeString(pointer(Text), length(Text), result);
end;
|
|
|
|
procedure Utf8ToStringVar(const Text: RawUtf8; var result: string);
begin
  // procedure flavor of Utf8ToString() writing into a var string
  Utf8DecodeToUnicodeString(pointer(Text), length(Text), result);
end;
|
|
|
|
procedure Utf8ToFileName(const Text: RawUtf8; var result: TFileName);
begin
  // the TFileName variable is type-casted to string for the decoder
  Utf8DecodeToUnicodeString(pointer(Text), length(Text), string(result));
end;
|
|
|
|
{$else}
|
|
|
|
function Ansi7ToString(const Text: RawByteString): string;
begin
  // direct assignment - only safe if we are SURE this text is 7-bit Ansi
  result := Text;
  {$ifdef FPC}
  // if Text is CP_RAWBYTESTRING then FPC won't handle it properly
  // - no FakeCodePage() since Text may be read-only
  SetCodePage(RawByteString(result), DefaultSystemCodePage, false);
  {$endif FPC}
end;
|
|
|
|
function Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt): string;
begin
  // string is 8-bit in this branch: plain copy of Len bytes
  SetString(result, PAnsiChar(Text), Len);
end;
|
|
|
|
procedure Ansi7ToString(Text: PWinAnsiChar; Len: PtrInt; var result: string);
begin
  // string is 8-bit in this branch: plain copy of Len bytes
  SetString(result, PAnsiChar(Text), Len);
end;
|
|
|
|
function StringToAnsi7(const Text: string): RawByteString;
begin
  // direct assignment - only safe if we are SURE this text is 7-bit Ansi
  result := Text;
end;
|
|
|
|
function StringToWinAnsi(const Text: string): WinAnsiString;
begin
  // convert from the current Ansi code page engine into the WinAnsi one
  result := WinAnsiConvert.AnsiToAnsi(CurrentAnsiConvert, Text);
end;
|
|
|
|
function StringBufferToUtf8(Dest: PUtf8Char; Source: PChar; SourceChars: PtrInt): PUtf8Char;
begin
  // delegate to the current Ansi code page engine and return its result
  result := CurrentAnsiConvert.AnsiBufferToUtf8(Dest, Source, SourceChars);
end;
|
|
|
|
procedure StringBufferToUtf8(Source: PChar; out result: RawUtf8);
begin
  // Source length is computed via StrLen() before the conversion
  CurrentAnsiConvert.AnsiBufferToRawUtf8(Source, StrLen(Source), result);
end;
|
|
|
|
function StringToUtf8(const Text: string): RawUtf8;
begin
  // Ansi -> UTF-8 through the current code page engine
  result := CurrentAnsiConvert.AnsiToUtf8(Text);
end;
|
|
|
|
procedure StringToUtf8(Text: PChar; TextLen: PtrInt; var result: RawUtf8);
begin
  // buffer flavor using the current code page engine
  CurrentAnsiConvert.AnsiBufferToRawUtf8(Text, TextLen, result);
end;
|
|
|
|
procedure StringToUtf8(const Text: string; var result: RawUtf8);
begin
  // procedure flavor writing into an existing RawUtf8 variable
  result := CurrentAnsiConvert.AnsiToUtf8(Text);
end;
|
|
|
|
function StringToUtf8(const Text: string; var Temp: TSynTempBuffer): integer;
var
  chars: integer;
begin
  // reserve 3 bytes per source char in Temp, then return the number of
  // bytes between Temp.buf and the position returned by AnsiBufferToUtf8()
  chars := length(Text);
  Temp.Init(chars * 3);
  if chars = 0 then
    result := 0
  else
    result := CurrentAnsiConvert.
      AnsiBufferToUtf8(Temp.buf, pointer(Text), chars) - PUtf8Char(Temp.buf);
end;
|
|
|
|
function ToUtf8(const Text: string): RawUtf8;
begin
  // same processing as StringToUtf8(Text) under a shorter name
  result := CurrentAnsiConvert.AnsiToUtf8(Text);
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
|
|
function StringToRawUnicode(const S: string): RawUnicode;
begin
  // Ansi -> UTF-16 through the current code page engine
  result := CurrentAnsiConvert.AnsiToRawUnicode(S);
end;
|
|
|
|
function StringToRawUnicode(P: PChar; L: integer): RawUnicode;
begin
  // buffer flavor: Ansi -> UTF-16 through the current code page engine
  result := CurrentAnsiConvert.AnsiToRawUnicode(P, L);
end;
|
|
|
|
function RawUnicodeToString(const U: RawUnicode): string;
begin
  // the char count comes from StrLenW() and not from length(U), to handle
  // the case when U was used as a buffer
  result := CurrentAnsiConvert.UnicodeBufferToAnsi(
    Pointer(U), StrLenW(Pointer(U)));
end;
|
|
|
|
{$endif PUREMORMOT2}
|
|
|
|
function StringToSynUnicode(const S: string): SynUnicode;
begin
  // Ansi -> UTF-16 through the current code page engine
  result := CurrentAnsiConvert.AnsiToUnicodeString(pointer(S), length(S));
end;
|
|
|
|
procedure StringToSynUnicode(const S: string; var result: SynUnicode);
begin
  // procedure flavor writing into an existing SynUnicode variable
  result := CurrentAnsiConvert.AnsiToUnicodeString(pointer(S), length(S));
end;
|
|
|
|
function RawUnicodeToString(P: PWideChar; L: integer): string;
begin
  // UTF-16 -> Ansi through the current code page engine
  result := CurrentAnsiConvert.UnicodeBufferToAnsi(P, L);
end;
|
|
|
|
procedure RawUnicodeToString(P: PWideChar; L: integer; var result: string);
begin
  // procedure flavor writing into an existing string variable
  result := CurrentAnsiConvert.UnicodeBufferToAnsi(P, L);
end;
|
|
|
|
function SynUnicodeToString(const U: SynUnicode): string;
begin
  // UTF-16 -> Ansi through the current code page engine
  result := CurrentAnsiConvert.UnicodeBufferToAnsi(Pointer(U), length(U));
end;
|
|
|
|
function Utf8DecodeToString(P: PUtf8Char; L: integer): string;
begin
  // UTF-8 -> Ansi: the result is type-casted to RawByteString for the engine
  CurrentAnsiConvert.Utf8BufferToAnsi(P, L, RawByteString(result));
end;
|
|
|
|
procedure Utf8DecodeToString(P: PUtf8Char; L: integer; var result: string);
begin
  // UTF-8 -> Ansi: the result is type-casted to RawByteString for the engine
  CurrentAnsiConvert.Utf8BufferToAnsi(P, L, RawByteString(result));
end;
|
|
|
|
function Utf8ToString(const Text: RawUtf8): string;
begin
  // UTF-8 -> Ansi through the current code page engine
  result := CurrentAnsiConvert.Utf8ToAnsi(Text);
end;
|
|
|
|
procedure Utf8ToStringVar(const Text: RawUtf8; var result: string);
begin
  // procedure flavor of Utf8ToString() writing into a var string
  result := CurrentAnsiConvert.Utf8ToAnsi(Text);
end;
|
|
|
|
procedure Utf8ToFileName(const Text: RawUtf8; var result: TFileName);
begin
  // UTF-8 -> Ansi file name through the current code page engine
  result := CurrentAnsiConvert.Utf8ToAnsi(Text);
end;
|
|
|
|
{$endif UNICODE}
|
|
|
|
function ToUtf8(const Ansi7Text: ShortString): RawUtf8;
begin
  // raw copy of the ShortString payload: [0] holds its length byte
  FastSetString(result, @Ansi7Text[1], ord(Ansi7Text[0]));
end;
|
|
|
|
{$ifdef HASVARUSTRING} // some UnicodeString dedicated functions
|
|
|
|
function UnicodeStringToUtf8(const S: UnicodeString): RawUtf8;
begin
  // direct UTF-16 -> UTF-8 encoding of the whole string content
  RawUnicodeToUtf8(pointer(S), Length(S), result);
end;
|
|
|
|
function Utf8DecodeToUnicodeString(const S: RawUtf8): UnicodeString;
begin
  // RawUtf8 flavor of the buffer-based overload
  Utf8DecodeToUnicodeString(pointer(S), Length(S), result);
end;
|
|
|
|
procedure Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer; var result: UnicodeString);
var
  tmp: TSynTempBuffer;
begin
  if (P = nil) or
     (L = 0) then
  begin
    result := '';
    exit;
  end;
  // decode into a temporary buffer of L * 3 bytes, then copy the
  // resulting chars (Utf8ToWideChar() result is halved) into result
  tmp.Init(L * 3);
  SetString(result, PWideChar(tmp.buf), Utf8ToWideChar(tmp.buf, P, L) shr 1);
  tmp.Done;
end;
|
|
|
|
function UnicodeStringToWinAnsi(const S: UnicodeString): WinAnsiString;
begin
  // UTF-16 -> WinAnsi through the WinAnsi conversion engine
  result := WinAnsiConvert.UnicodeBufferToAnsi(pointer(S), Length(S));
end;
|
|
|
|
function Utf8DecodeToUnicodeString(P: PUtf8Char; L: integer): UnicodeString;
begin
  // function flavor of the procedure overload filling a var UnicodeString
  Utf8DecodeToUnicodeString(P, L, result);
end;
|
|
|
|
function WinAnsiToUnicodeString(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): UnicodeString;
begin
  // pre-allocate one UTF-16 char per source byte ...
  SetString(result, nil, WinAnsiLen);
  // ... then let the WinAnsi engine fill the allocated buffer
  WinAnsiConvert.AnsiBufferToUnicode(pointer(result), WinAnsi, WinAnsiLen);
end;
|
|
|
|
function WinAnsiToUnicodeString(const WinAnsi: WinAnsiString): UnicodeString;
begin
  // string flavor of the buffer-based overload
  result := WinAnsiToUnicodeString(pointer(WinAnsi), Length(WinAnsi));
end;
|
|
|
|
{$endif HASVARUSTRING}
|
|
|
|
function Utf8DecodeToUnicodeRawByteString(P: PUtf8Char; L: integer): RawByteString;
begin
  if (P = nil) or
     (L = 0) then
  begin
    result := '';
    exit;
  end;
  // allocate L * 3 bytes, decode in-place, then adjust the final length
  // to the value returned by Utf8ToWideChar()
  FastNewRawByteString(result, L * 3);
  FakeSetLength(result, Utf8ToWideChar(pointer(result), P, L));
end;
|
|
|
|
function Utf8DecodeToUnicodeRawByteString(const U: RawUtf8): RawByteString;
begin
  // RawUtf8 flavor of the buffer-based overload
  result := Utf8DecodeToUnicodeRawByteString(pointer(U), length(U));
end;
|
|
|
|
function Utf8DecodeToUnicodeStream(P: PUtf8Char; L: integer): TStream;
begin
  // wrap the decoded UTF-16 bytes into a new TRawByteStringStream instance
  result := TRawByteStringStream.Create(
    Utf8DecodeToUnicodeRawByteString(P, L));
end;
|
|
|
|
function WinAnsiToSynUnicode(WinAnsi: PAnsiChar; WinAnsiLen: PtrInt): SynUnicode;
begin
  // pre-allocate one UTF-16 char per source byte ...
  SetString(result, nil, WinAnsiLen);
  // ... then let the WinAnsi engine fill the allocated buffer
  WinAnsiConvert.AnsiBufferToUnicode(pointer(result), WinAnsi, WinAnsiLen);
end;
|
|
|
|
function WinAnsiToSynUnicode(const WinAnsi: WinAnsiString): SynUnicode;
begin
  // string flavor of the buffer-based overload
  result := WinAnsiToSynUnicode(pointer(WinAnsi), Length(WinAnsi));
end;
|
|
|
|
procedure UniqueRawUtf8ZeroToTilde(var u: RawUtf8; MaxSize: PtrInt);
var
  n, ndx: PtrInt;
  bytes: PByteArray;
begin
  bytes := pointer(u);
  n := length(u);
  if n > MaxSize then
  begin
    // longer than MaxSize: write a #0 at this position, and only
    // process the bytes before it
    bytes[MaxSize] := 0;
    n := MaxSize;
  end;
  // in-place replacement of any #0 byte by '~'
  for ndx := 0 to n - 1 do
    if bytes[ndx] = 0 then
      bytes[ndx] := ord('~');
end;
|
|
|
|
const
  /// escape char used by UnZeroed() and Zeroed()
  // - any byte would do: UnZeroed() emits it followed by '0' for a #0 byte,
  // or followed by itself for a genuine '~' byte
  ZEROED_CW = '~';
|
|
|
|
function UnZeroed(const bin: RawByteString): RawUtf8;
var
  remaining, firstZero, firstTilde, plain: PtrInt;
  ch: AnsiChar;
  src, dst: PAnsiChar;
begin
  result := '';
  remaining := length(bin);
  if remaining = 0 then
    exit;
  src := pointer(bin);
  firstZero := StrLen(src);
  firstTilde := ByteScanIndex(pointer(src), remaining, ord(ZEROED_CW));
  if (firstTilde < 0) and
     (firstZero = remaining) then
  begin
    // no #0 nor ~ within the input: nothing to convert
    result := bin;
    exit;
  end;
  // plain = number of leading bytes which need no escaping
  if (firstTilde < 0) or
     (firstZero < firstTilde) then
    plain := firstZero
  else
    plain := firstTilde;
  FastSetString(result, remaining shl 1); // room for every byte escaped
  dst := pointer(result);
  MoveFast(src^, dst^, plain);
  inc(src, plain);
  inc(dst, plain);
  dec(remaining, plain);
  repeat
    ch := src^;
    if (ch = #0) or
       (ch = ZEROED_CW) then
    begin
      // escape: #0 becomes ~0 and ~ becomes ~~
      dst^ := ZEROED_CW;
      inc(dst);
      if ch = #0 then
        ch := '0';
    end;
    dst^ := ch;
    inc(dst);
    inc(src);
    dec(remaining);
  until remaining = 0;
  FakeLength(result, dst - pointer(result));
end;
|
|
|
|
function Zeroed(const u: RawUtf8): RawByteString;
var
  len, c: PtrInt;
  a: AnsiChar;
  s, d: PAnsiChar;
begin
  // decode the ZEROED_CW escape pairs of u: ~0 becomes a #0 byte, and any
  // other ~x pair becomes a single ~ byte
  result := '';
  len := length(u);
  if len = 0 then
    exit;
  s := pointer(u);
  c := ByteScanIndex(pointer(s), len, ord(ZEROED_CW));
  if c < 0 then
  begin
    result := u; // no escape char at all: nothing to decode
    exit;
  end;
  FastNewRawByteString(result, len); // decoding never grows the content
  d := pointer(result);
  MoveFast(s^, d^, c); // copy the leading bytes before the first ~
  inc(s, c);
  inc(d, c);
  dec(len, c);
  repeat
    a := s^;
    if a = ZEROED_CW then
    begin
      // consume the escape char, then look at the following byte
      inc(s);
      dec(len);
      if s^ = '0' then
        a := #0;
    end;
    d^ := a;
    inc(d);
    inc(s);
    dec(len);
  // a malformed input ending with an unpaired ~ decrements len twice in
  // the last iteration: test <= 0 so that the loop can't run past the end
  until len <= 0;
  FakeLength(result, d - pointer(result));
end;
|
|
|
|
procedure Utf8ToWideString(const Text: RawUtf8; var result: WideString);
begin
  // RawUtf8 flavor of the buffer-based overload
  Utf8ToWideString(pointer(Text), Length(Text), result);
end;
|
|
|
|
function Utf8ToWideString(const Text: RawUtf8): WideString;
begin
  {$ifdef FPC}
  // on FPC, the result variable is finalized before being filled
  Finalize(result);
  {$endif FPC}
  Utf8ToWideString(pointer(Text), Length(Text), result);
end;
|
|
|
|
procedure Utf8ToWideString(Text: PUtf8Char; Len: PtrInt; var result: WideString);
var
  tmp: TSynTempBuffer;
begin
  if (Text = nil) or
     (Len = 0) then
  begin
    result := '';
    exit;
  end;
  // decode into a temporary buffer of Len * 3 bytes, then copy the
  // resulting chars (Utf8ToWideChar() result is halved) into result
  tmp.Init(Len * 3);
  SetString(result, PWideChar(tmp.buf), Utf8ToWideChar(tmp.buf, Text, Len) shr 1);
  tmp.Done;
end;
|
|
|
|
function WideStringToUtf8(const aText: WideString): RawUtf8;
begin
  // direct UTF-16 -> UTF-8 encoding of the whole WideString content
  RawUnicodeToUtf8(pointer(aText), length(aText), result);
end;
|
|
|
|
function Utf8ToSynUnicode(const Text: RawUtf8): SynUnicode;
begin
  // RawUtf8 flavor of the buffer-based overload
  Utf8ToSynUnicode(pointer(Text), length(Text), result);
end;
|
|
|
|
procedure Utf8ToSynUnicode(const Text: RawUtf8; var result: SynUnicode);
begin
  // RawUtf8 flavor of the buffer-based overload, filling a var SynUnicode
  Utf8ToSynUnicode(pointer(Text), length(Text), result);
end;
|
|
|
|
procedure Utf8ToSynUnicode(Text: PUtf8Char; Len: PtrInt; var result: SynUnicode);
var
  tmp: TSynTempBuffer;
  n: PtrInt;
begin
  // decode into a temporary buffer, then copy the n decoded chars
  n := Utf8DecodeToUnicode(Text, Len, tmp);
  SetString(result, PWideChar(tmp.buf), n);
  // Utf8DecodeToUnicode() leaves tmp.buf = nil when it did not call
  // tmp.Init(): release the buffer whenever it was initialized, even if
  // the decoding returned no char, to avoid leaking it
  if tmp.buf <> nil then
    tmp.Done;
end;
|
|
|
|
function Utf8DecodeToUnicode(const Text: RawUtf8; var temp: TSynTempBuffer): PtrInt;
begin
  // RawUtf8 flavor of the buffer-based overload
  result := Utf8DecodeToUnicode(pointer(Text), length(Text), temp);
end;
|
|
|
|
function Utf8DecodeToUnicode(Text: PUtf8Char; Len: PtrInt; var temp: TSynTempBuffer): PtrInt;
begin
  if (Text <> nil) and
     (Len > 0) then
  begin
    // decode into a temp buffer of Len * 3 bytes, and return the value
    // given by Utf8ToWideChar() halved
    temp.Init(Len * 3);
    result := Utf8ToWideChar(temp.buf, Text, Len) shr 1;
    exit;
  end;
  // nothing to decode: temp is left without any buffer
  temp.buf := nil;
  temp.len := 0;
  result := 0;
end;
|
|
|
|
|
|
{ **************** Text Case-(in)sensitive Conversion and Comparison }
|
|
|
|
function IdemPropNameUSameLenNotNull(P1, P2: PUtf8Char; P1P2Len: PtrInt): boolean;
begin
  result := false;
  // P1P2Len is reused as the address of the last 4-byte block within P1
  pointer(P1P2Len) := @P1[P1P2Len - SizeOf(cardinal)];
  // P2 is reused as the P2 - P1 offset, so that only P1 is incremented
  dec(PtrUInt(P2), PtrUInt(P1));
  // compare 4 bytes per loop, ignoring bit 5 ($20) of each byte
  while PtrUInt(P1P2Len) >= PtrUInt(P1) do
  begin
    if (PCardinal(P1)^ xor PCardinal(@P2[PtrUInt(P1)])^) and $dfdfdfdf <> 0 then
      exit;
    inc(PCardinal(P1));
  end;
  // compare the remaining bytes one by one, up to the end of P1
  inc(P1P2Len, SizeOf(cardinal));
  while PtrUInt(P1) < PtrUInt(P1P2Len) do
  begin
    if (ord(P1^) xor ord(P2[PtrUInt(P1)])) and $df <> 0 then
      exit;
    inc(PByte(P1));
  end;
  result := true;
end;
|
|
|
|
function PropNameValid(P: PUtf8Char): boolean;
var
  chars: PTextCharSet;
{%H-}begin
  result := false;
  chars := @TEXT_CHARS;
  // first char must be in ['_', 'a'..'z', 'A'..'Z']
  if (P = nil) or
     not (tcIdentifierFirstChar in chars[P^]) then
    exit;
  // following chars can be ['_', '0'..'9', 'a'..'z', 'A'..'Z']
  repeat
    inc(P);
  until not (tcIdentifier in chars[P^]);
  // valid only if the first non-identifier char is the ending #0
  result := P^ = #0;
end;
|
|
|
|
function PropNamesValid(const Values: array of RawUtf8): boolean;
var
  v, c: PtrInt;
  chars: PTextCharSet;
begin
  result := false;
  chars := @TEXT_CHARS;
  for v := 0 to high(Values) do
    for c := 1 to length(Values[v]) do
      if not (tcIdentifier in chars[Values[v][c]]) then
        // this char is not in ['_', '0'..'9', 'a'..'z', 'A'..'Z']
        exit;
  // every char of every supplied value is an identifier char
  result := true;
end;
|
|
|
|
function PropNameSanitize(const text, fallback: RawUtf8): RawUtf8;
var
  ndx: PtrInt;
begin
  result := text;
  // 1. remove control chars, spaces and some punctuation (from the end,
  // so that deletion does not shift the not-yet-visited positions)
  for ndx := length(result) downto 1 do
    case result[ndx] of
      #0 .. ' ', '#', '"', '''', '*':
        delete(result, ndx, 1);
    end;
  // 2. replace some separators by an underscore
  for ndx := 1 to length(result) do
    case result[ndx] of
      '[', ']', '/', '\', '&', '@', '+', '-', '.':
        result[ndx] := '_';
    end;
  // 3. use the fallback text if the outcome is still not a valid name
  if not PropNameValid(pointer(result)) then
    result := fallback;
end;
|
|
|
|
function IdemPropName(const P1, P2: ShortString): boolean;
begin
  result := false;
  if P1[0] <> P2[0] then
    exit; // lengths differ
  if P1[0] = #0 then
    result := true // both are void
  else if (ord(P1[1]) xor ord(P2[1])) and $df = 0 then
    // first chars match when ignoring bit 5: compare the whole content
    result := IdemPropNameUSameLenNotNull(@P1[1], @P2[1], ord(P2[0]));
end;
|
|
|
|
function IdemPropName(const P1: ShortString; P2: PUtf8Char; P2Len: PtrInt): boolean;
begin
  if ord(P1[0]) <> P2Len then
    result := false // lengths differ
  else if P2Len = 0 then
    result := true // both are void
  else
    result := IdemPropNameUSameLenNotNull(@P1[1], P2, P2Len);
end;
|
|
|
|
function IdemPropName(P1, P2: PUtf8Char; P1Len, P2Len: PtrInt): boolean;
begin
  if P1Len <> P2Len then
    result := false // lengths differ
  else if P1Len = 0 then
    result := true // both are void
  else
    result := IdemPropNameUSameLenNotNull(P1, P2, P1Len);
end;
|
|
|
|
function IdemPropNameU(const P1: RawUtf8; P2: PUtf8Char; P2Len: PtrInt): boolean;
begin
  if PtrUInt(P1) = 0 then
    // P1 = '' only matches a void P2 buffer
    result := P2Len = 0
  else
    // same length, same first char (ignoring bit 5), then whole content
    result := (PStrLen(PAnsiChar(pointer(P1)) - _STRLEN)^ = P2Len) and
              ((PByte(P1)^ xor PByte(P2)^) and $df = 0) and
              IdemPropNameUSameLenNotNull(pointer(P1), pointer(P2), P2Len);
end;
|
|
|
|
function IdemPropNameU(const P1, P2: RawUtf8): boolean;
var
  len1: TStrLen;
begin
  if PtrUInt(P1) = PtrUInt(P2) then
    result := true // same string instance, or both are ''
  else if (PtrUInt(P1) = 0) or
          (PtrUInt(P2) = 0) then
    result := false // only one of them is ''
  else
  begin
    // same length, same first char (ignoring bit 5), then whole content
    len1 := PStrLen(PAnsiChar(pointer(P1)) - _STRLEN)^;
    result := (PStrLen(PAnsiChar(pointer(P2)) - _STRLEN)^ = len1) and
              ((PByte(P1)^ xor PByte(P2)^) and $df = 0) and
              IdemPropNameUSameLenNotNull(pointer(P1), pointer(P2), len1);
  end;
end;
|
|
|
|
function IdemPChar(p: PUtf8Char; up: PAnsiChar): boolean;
var
  {$ifdef CPUX86NOTPIC}
  table: TNormTable absolute NormToUpperAnsi7;
  {$else}
  table: PNormTable; // faster on PIC/ARM and x86_64
  {$endif CPUX86NOTPIC}
begin
  result := false;
  if p = nil then
    exit;
  if up <> nil then
  begin
    // p is reused as the p - up offset, so that only up is incremented
    dec(PtrUInt(p), PtrUInt(up));
    {$ifndef CPUX86NOTPIC}
    table := @NormToUpperAnsi7;
    {$endif CPUX86NOTPIC}
    // each up^ char should equal the table-normalized matching p^ char
    while up^ <> #0 do
    begin
      if table[up[PtrUInt(p)]] <> up^ then
        exit;
      inc(up);
    end;
  end;
  result := true;
end;
|
|
|
|
function IdemPChar(p: PUtf8Char; up: PAnsiChar; table: PNormTable): boolean;
begin
  result := false;
  if p = nil then
    exit;
  if up <> nil then
  begin
    // p is reused as the p - up offset, so that only up is incremented
    dec(PtrUInt(p), PtrUInt(up));
    // each up^ char should equal the table-normalized matching p^ char
    while up^ <> #0 do
    begin
      if table[up[PtrUInt(p)]] <> up^ then
        exit;
      inc(up);
    end;
  end;
  result := true;
end;
|
|
|
|
function IdemPCharAnsi(
  {$ifdef CPUX86NOTPIC}
  const table: TNormTable;
  {$else}
  const table: PNormTable;
  {$endif CPUX86NOTPIC}
  p: PUtf8Char; up: PAnsiChar): boolean;
  {$ifdef HASINLINE}inline;{$endif}
begin
  // in this local IdemPChar() version, p and up are expected to be <> nil
  result := false;
  // p is reused as the p - up offset, so that only up is incremented
  dec(PtrUInt(p), PtrUInt(up));
  while up^ <> #0 do
  begin
    if table[up[PtrUInt(p)]] <> up^ then
      exit;
    inc(up);
  end;
  result := true;
end;
|
|
|
|
function IdemPCharByte(
  {$ifdef CPUX86NOTPIC}
  const table: TNormTableByte;
  {$else}
  const table: PByteArray;
  {$endif CPUX86NOTPIC}
  p: PUtf8Char; up: PAnsiChar): boolean;
  {$ifdef HASINLINE}inline;{$endif}
begin
  // in this local IdemPChar() version, p and up are expected to be <> nil
  result := false;
  // p is reused as the p - up offset, so that only up is incremented
  dec(PtrUInt(p), PtrUInt(up));
  while up^ <> #0 do
  begin
    if table[PtrInt(up[PtrUInt(p)])] <> PByte(up)^ then
      exit;
    inc(up);
  end;
  result := true;
end;
|
|
|
|
function IdemPCharWithoutWhiteSpace(p: PUtf8Char; up: PAnsiChar): boolean;
begin
  result := False;
  if p = nil then
    exit;
  if up <> nil then
    while up^ <> #0 do
    begin
      // trim white space (any char <= ' '), failing at end of text
      while p^ <= ' ' do
      begin
        if p^ = #0 then
          exit;
        inc(p);
      end;
      if NormToUpperAnsi7[p^] <> up^ then
        exit;
      inc(p);
      inc(up);
    end;
  result := true;
end;
|
|
|
|
function IdemPCharArray(p: PUtf8Char; const upArray: array of PAnsiChar): integer;
var
  first2: word;
  item: PAnsiChar;
  {$ifdef CPUX86NOTPIC}
  tab: TNormTableByte absolute NormToUpperAnsi7;
  {$else}
  tab: PByteArray; // faster on PIC/ARM and x86_64
  {$endif CPUX86NOTPIC}
begin
  if p <> nil then
  begin
    {$ifndef CPUX86NOTPIC}
    tab := @NormToUpperAnsi7;
    {$endif CPUX86NOTPIC}
    // normalize the first two p^ chars into a single 16-bit value
    first2 := tab[ord(p[0])] + tab[ord(p[1])] shl 8;
    for result := 0 to high(upArray) do
    begin
      item := upArray[result];
      // quick check of the first two chars, then compare the remaining
      if (PWord(item)^ = first2) and
         IdemPCharByte(tab, p + 2, item + 2) then
        exit;
    end;
  end;
  result := -1;
end;
|
|
|
|
function IdemPPChar(p: PUtf8Char; up: PPAnsiChar): PtrInt;
var
  first2: word;
  item: PAnsiChar;
  offset: PtrUInt;
  expected: byte;
  {$ifdef CPUX86NOTPIC}
  tab: TNormTableByte absolute NormToUpperAnsi7;
  {$else}
  tab: PByteArray; // faster on PIC/ARM and x86_64
  {$endif CPUX86NOTPIC}
begin
  if p <> nil then
  begin
    {$ifndef CPUX86NOTPIC}
    tab := @NormToUpperAnsi7;
    {$endif CPUX86NOTPIC}
    // normalize the first two p^ chars into a single 16-bit value
    first2 := tab[ord(p[0])] + tab[ord(p[1])] shl 8;
    result := 0;
    item := PPointerArray(up)[0];
    while item <> nil do // a nil entry marks the end of the up^[] list
    begin
      if PWord(item)^ = first2 then
      begin
        // first two chars match: compare the remaining chars of this item
        offset := PtrUInt(p);
        dec(offset, PtrUInt(item));
        inc(item, 2);
        repeat
          expected := PByte(item)^;
          if expected = 0 then
            exit; // found IdemPChar(p^, up^[result])
          if tab[PtrUInt(item[offset])] <> expected then
            break; // at least one char doesn't match
          inc(item);
        until false;
      end;
      inc(result);
      item := PPointerArray(up)[result];
    end;
  end;
  result := -1;
end;
|
|
|
|
function IdemPCharArrayBy2(p: PUtf8Char; const upArrayBy2Chars: RawUtf8): PtrInt;
begin
  if p = nil then
    result := -1
  else
    // search the normalized first two p^ chars, as a single 16-bit value,
    // within upArrayBy2Chars seen as an array of 16-bit items
    result := WordScanIndex(
      pointer(upArrayBy2Chars), length(upArrayBy2Chars) shr 1,
      NormToUpperAnsi7Byte[ord(p[0])] + NormToUpperAnsi7Byte[ord(p[1])] shl 8);
end;
|
|
|
|
function IdemPCharU(p, up: PUtf8Char): boolean;
begin
  result := false;
  if (p <> nil) and
     (up <> nil) then
  begin
    // each up^ byte should equal the next GetNextUtf8Upper(p) value
    while up^ <> #0 do
    begin
      if GetNextUtf8Upper(p) <> ord(up^) then
        exit;
      inc(up);
    end;
    result := true;
  end;
end;
|
|
|
|
function IdemPCharW(p: PWideChar; up: PUtf8Char): boolean;
begin
  result := false;
  if (p <> nil) and
     (up <> nil) then
  begin
    while up^ <> #0 do
    begin
      // any p^ above #255 can't match - others go through NormToUpperByte[]
      if (p^ > #255) or
         (up^ <> AnsiChar(NormToUpperByte[ord(p^)])) then
        exit;
      inc(p);
      inc(up);
    end;
    result := true;
  end;
end;
|
|
|
|
function StartWith(const text, upTextStart: RawUtf8): boolean;
begin
  result := false;
  if (PtrUInt(text) = 0) or
     (PtrUInt(upTextStart) = 0) then
    exit; // a void text or prefix never matches
  // text must be at least as long as the prefix before comparing
  if PStrLen(PAnsiChar(pointer(text)) - _STRLEN)^ >=
       PStrLen(PAnsiChar(pointer(upTextStart)) - _STRLEN)^ then
    result := IdemPChar(pointer(text), pointer(upTextStart));
end;
|
|
|
|
function EndWith(const text, upTextEnd: RawUtf8): boolean;
var
  offset: PtrInt;
begin
  // offset = position within text where the suffix should start
  offset := length(text) - length(upTextEnd);
  if offset < 0 then
    result := false // text is shorter than the suffix
  else
    result := IdemPChar(PUtf8Char(pointer(text)) + offset, pointer(upTextEnd));
end;
|
|
|
|
function EndWithArray(const text: RawUtf8; const upArray: array of RawUtf8): integer;
var
  textLen, offset: PtrInt;
  {$ifdef CPUX86NOTPIC}
  tab: TNormTableByte absolute NormToUpperAnsi7;
  {$else}
  tab: PByteArray; // faster on PIC/ARM and x86_64
  {$endif CPUX86NOTPIC}
begin
  textLen := length(text);
  if textLen > 0 then
  begin
    {$ifndef CPUX86NOTPIC}
    tab := @NormToUpperAnsi7;
    {$endif CPUX86NOTPIC}
    for result := 0 to high(upArray) do
    begin
      // offset = position within text where this suffix should start
      offset := textLen - length(upArray[result]);
      if offset < 0 then
        continue; // text is shorter than this suffix
      // a void suffix always matches
      if (upArray[result] = '') or
         IdemPCharByte(tab, PUtf8Char(pointer(text)) + offset,
           pointer(upArray[result])) then
        exit;
    end;
  end;
  result := -1;
end;
|
|
|
|
function IdemFileExt(p: PUtf8Char; extup: PAnsiChar; sepChar: AnsiChar): boolean;
var
  ext: PUtf8Char;
begin
  // locate the last sepChar within p, then compare the text starting at
  // this position with the extup pattern using IdemPChar()
  result := false;
  if (p = nil) or
     (extup = nil) then
    exit;
  ext := nil;
  // while..do and not repeat..until: p^ is tested before any inc(p), so
  // that a void p text is never read beyond its ending #0
  while p^ <> #0 do
  begin
    if p^ = sepChar then
      ext := p; // get last '.' position from p into ext
    inc(p);
  end;
  result := IdemPChar(ext, extup); // returns false if ext = nil
end;
|
|
|
|
function IdemFileExts(p: PUtf8Char; const extup: array of PAnsiChar;
  sepChar: AnsiChar): integer;
var
  ext: PUtf8Char;
begin
  // locate the last sepChar within p, then return the index in extup[] of
  // the pattern matching the text at this position, or -1 if none
  result := -1;
  // high(extup) >= 0 so that a single-item extup[] array is also searched
  if (p = nil) or
     (high(extup) < 0) then
    exit;
  ext := nil;
  // while..do and not repeat..until: p^ is tested before any inc(p), so
  // that a void p text is never read beyond its ending #0
  while p^ <> #0 do
  begin
    if p^ = sepChar then
      ext := p; // get last '.' position from p into ext
    inc(p);
  end;
  if ext <> nil then
    result := IdemPCharArray(ext, extup);
end;
|
|
|
|
function PosCharAny(Str: PUtf8Char; Characters: PAnsiChar): PUtf8Char;
var
  candidate: PAnsiChar;
begin
  result := nil;
  if (Str = nil) or
     (Characters = nil) or
     (Characters^ = #0) then
    exit;
  while Str^ <> #0 do
  begin
    // is the current Str^ char one of the Characters^ ?
    candidate := Characters;
    repeat
      if candidate^ = Str^ then
      begin
        result := Str; // return the position of the first match
        exit;
      end;
      inc(candidate);
    until candidate^ = #0;
    inc(Str);
  end;
end;
|
|
|
|
function PosI(uppersubstr: PUtf8Char; const str: RawUtf8): PtrInt;
var
  first: AnsiChar;
  {$ifdef CPUX86NOTPIC}
  table: TNormTable absolute NormToUpperAnsi7;
  {$else}
  table: PNormTable;
  {$endif CPUX86NOTPIC}
begin
  if uppersubstr <> nil then
  begin
    {$ifndef CPUX86NOTPIC}
    table := @NormToUpperAnsi7;
    {$endif CPUX86NOTPIC}
    first := uppersubstr^;
    for result := 1 to Length(str) do
      // quick check of the first char, then compare the following chars
      if (table[str[result]] = first) and
         IdemPCharAnsi(table, @PUtf8Char(pointer(str))[result],
           PAnsiChar(uppersubstr) + 1) then
        exit; // result is the 1-based position of the match
  end;
  result := 0;
end;
|
|
|
|
function StrPosI(uppersubstr, str: PUtf8Char): PUtf8Char;
var
  first: AnsiChar;
  {$ifdef CPUX86NOTPIC}
  table: TNormTable absolute NormToUpperAnsi7;
  {$else}
  table: PNormTable;
  {$endif CPUX86NOTPIC}
begin
  if (uppersubstr <> nil) and
     (str <> nil) then
  begin
    {$ifndef CPUX86NOTPIC}
    table := @NormToUpperAnsi7;
    {$endif CPUX86NOTPIC}
    first := uppersubstr^;
    inc(uppersubstr); // now points to the chars following the first one
    result := str;
    while result^ <> #0 do
    begin
      // quick check of the first char, then compare the following chars
      if (table[result^] = first) and
         IdemPCharAnsi(table, result + 1, PAnsiChar(uppersubstr)) then
        exit;
      inc(result);
    end;
  end;
  result := nil;
end;
|
|
|
|
function PosIU(substr: PUtf8Char; const str: RawUtf8): integer;
var
  cur: PUtf8Char;
begin
  result := 0;
  if (substr = nil) or
     (str = '') then
    exit;
  cur := pointer(str);
  repeat
    // GetNextUtf8Upper() moves cur forward: on a first-char match, the
    // chars following substr^ are compared at the new cur position
    if (GetNextUtf8Upper(cur) = ord(substr^)) and
       IdemPCharU(cur, substr + 1) then
    begin
      result := cur - pointer(str);
      exit;
    end;
  until cur^ = #0;
end;
|
|
|
|
function strspn(s, accept: pointer): integer;
// FPC is efficient at compiling this code, but is SLOWER when inlined
var
  quad: PCardinal;
  ch: AnsiChar;
  four: cardinal;
begin
  // returns size of initial segment of s which are in accept
  result := 0;
  repeat
    ch := PAnsiChar(s)[result];
    if ch = #0 then
      break;
    // accept is read 4 bytes at once, then checked byte per byte:
    // break = ch was found in accept, exit = ch is not part of accept
    quad := accept;
    repeat
      four := quad^;
      inc(quad);
      // 1st byte
      if AnsiChar(four) = ch then
        break
      else if AnsiChar(four) = #0 then
        exit;
      // 2nd byte
      four := four shr 8;
      if AnsiChar(four) = ch then
        break
      else if AnsiChar(four) = #0 then
        exit;
      // 3rd byte
      four := four shr 8;
      if AnsiChar(four) = ch then
        break
      else if AnsiChar(four) = #0 then
        exit;
      // 4th byte
      four := four shr 8;
      if AnsiChar(four) = ch then
        break
      else if AnsiChar(four) = #0 then
        exit;
    until false;
    inc(result);
  until false;
end;
|
|
|
|
function strcspn(s, reject: pointer): integer;
// FPC is efficient at compiling this code, but is SLOWER when inlined
var
  quad: PCardinal;
  ch: AnsiChar;
  four: cardinal;
begin
  // returns size of initial segment of s which are not in reject
  result := 0;
  repeat
    ch := PAnsiChar(s)[result];
    if ch = #0 then
      break;
    // reject is read 4 bytes at once, then checked byte per byte:
    // exit = ch was found in reject, break = ch is not part of reject
    quad := reject;
    repeat
      four := quad^;
      inc(quad);
      // 1st byte
      if AnsiChar(four) = ch then
        exit
      else if AnsiChar(four) = #0 then
        break;
      // 2nd byte
      four := four shr 8;
      if AnsiChar(four) = ch then
        exit
      else if AnsiChar(four) = #0 then
        break;
      // 3rd byte
      four := four shr 8;
      if AnsiChar(four) = ch then
        exit
      else if AnsiChar(four) = #0 then
        break;
      // 4th byte
      four := four shr 8;
      if AnsiChar(four) = ch then
        exit
      else if AnsiChar(four) = #0 then
        break;
    until false;
    inc(result);
  until false;
end;
|
|
|
|
function StrCompL(P1, P2: pointer; L, Default: PtrInt): PtrInt;
var
  i: PtrInt;
begin
  // compare the first L bytes of P1 and P2: return the difference of the
  // first non-matching bytes, or Default if all L bytes are identical
  // - nothing is read from P1/P2 when L <= 0: Default is returned
  i := 0;
  while i < L do
  begin
    result := PByteArray(P1)[i] - PByteArray(P2)[i];
    if result <> 0 then
      exit;
    inc(i);
  end;
  result := Default;
end;
|
|
|
|
function StrCompIL(P1, P2: pointer; L, Default: PtrInt): PtrInt;
var
  i: PtrInt;
  {$ifdef CPUX86NOTPIC}
  tab: TNormTableByte absolute NormToUpperAnsi7Byte;
  {$else}
  tab: PByteArray; // faster on PIC/ARM and x86_64
  {$endif CPUX86NOTPIC}
begin
  // compare the first L bytes of P1 and P2 after normalization through
  // the NormToUpperAnsi7Byte[] table: return the difference of the first
  // non-matching normalized bytes, or Default if all L bytes match
  // - nothing is read from P1/P2 when L <= 0: Default is returned
  i := 0;
  {$ifndef CPUX86NOTPIC}
  tab := @NormToUpperAnsi7Byte;
  {$endif CPUX86NOTPIC}
  while i < L do
  begin
    // the difference is computed from the normalized values (as StrIComp
    // does), so that the ordering is consistent with the equality test:
    // a raw byte difference would sort 'a' after 'B' but 'A' before 'b'
    result := tab[PByteArray(P1)[i]] - tab[PByteArray(P2)[i]];
    if result <> 0 then
      exit;
    inc(i);
  end;
  result := Default;
end;
|
|
|
|
function StrICompNotNil(Str1, Str2: pointer; Up: PNormTableByte): PtrInt;
var
  n1, n2: byte; // integer/PtrInt are actually slower on FPC
begin
  // result first holds the Str2 - Str1 offset, to only increment Str1
  result := PtrInt(PtrUInt(Str2)) - PtrInt(PtrUInt(Str1));
  if result = 0 then
    exit; // same pointer means same content
  repeat
    n1 := Up[PByteArray(Str1)[0]];
    n2 := Up[PByteArray(Str1)[result]];
    inc(PByte(Str1));
  until (n1 = 0) or
        (n1 <> n2);
  // difference of the last normalized bytes (0 if both reached the end)
  result := n1 - n2;
end;
|
|
|
|
function StrICompLNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt;
begin
  // compare up to L bytes of Str1 and Str2 after normalization through
  // the Up[] table: return 0 if they all match, or the difference of the
  // first non-matching normalized bytes
  result := 0;
  repeat
    if Up[PByteArray(Str1)[result]] = Up[PByteArray(Str2)[result]] then
    begin
      inc(result);
      if result < L then
        continue
      else
        break;
    end;
    // the difference is computed from the normalized values (as
    // StrICompNotNil does), so that the ordering is consistent with the
    // equality test above - a raw byte difference would not be
    result := Up[PByteArray(Str1)[result]] - Up[PByteArray(Str2)[result]];
    exit;
  until false;
  result := 0;
end;
|
|
|
|
function StrILNotNil(Str1, Str2: pointer; Up: PNormTableByte; L: PtrInt): PtrInt;
begin
  // return the number of leading bytes which match after normalization
  // through the Up[] table, stopping once L bytes have matched
  result := 0;
  while Up[PByteArray(Str1)[result]] = Up[PByteArray(Str2)[result]] do
  begin
    inc(result);
    if result = L then
      break;
  end;
end;
|
|
|
|
|
|
function StrIComp(Str1, Str2: pointer): PtrInt;
var
  n1, n2: byte; // integer/PtrInt are actually slower on FPC
  {$ifdef CPUX86NOTPIC}
  table: TNormTableByte absolute NormToUpperAnsi7Byte;
  {$else}
  table: PByteArray;
  {$endif CPUX86NOTPIC}
begin
  // result first holds the Str2 - Str1 offset, to only increment Str1
  result := PtrInt(PtrUInt(Str2)) - PtrInt(PtrUInt(Str1));
  if result = 0 then
    exit; // same pointer means same content
  if Str1 = nil then
  begin
    result := -1; // Str1=''
    exit;
  end;
  if Str2 = nil then
  begin
    result := 1; // Str2=''
    exit;
  end;
  {$ifndef CPUX86NOTPIC}
  table := @NormToUpperAnsi7Byte;
  {$endif CPUX86NOTPIC}
  repeat
    n1 := table[PByteArray(Str1)[0]];
    n2 := table[PByteArray(Str1)[result]];
    inc(PByte(Str1));
  until (n1 = 0) or
        (n1 <> n2);
  // difference of the last normalized bytes (0 if both reached the end)
  result := n1 - n2;
end;
|
|
|
|
// check if the #0-terminated up text appears within the current line of p
// - each p char is translated via the NormToUpperAnsi7 table before being
// compared to up, which is used as supplied (presumably already uppercased)
// - the line ends at #13 or #10; if pEnd is nil, it also ends at #0, otherwise
// the search never goes beyond pEnd
// - returns false if p or up is nil
function GetLineContains(p, pEnd, up: PUtf8Char): boolean;
var
  n: PtrInt;
  {$ifdef CPUX86NOTPIC}
  table: TNormTable absolute NormToUpperAnsi7Byte;
  {$else}
  table: PNormTable;
  {$endif CPUX86NOTPIC}
begin
  result := false;
  if (p = nil) or
     (up = nil) then
    exit;
  {$ifndef CPUX86NOTPIC}
  table := @NormToUpperAnsi7;
  {$endif CPUX86NOTPIC}
  if pEnd = nil then
    // #0-terminated input
    repeat
      if p^ <= #13 then // p^ into a temp var is slower
      begin
        if (p^ = #0) or
           (p^ = #13) or
           (p^ = #10) then
          exit; // end of line or text: not found
      end
      else if table[p^] = up^ then
      begin
        // first char did match: compare the following ones
        n := 0;
        repeat
          inc(n);
          if up[n] = #0 then
          begin
            result := true; // whole up text was found
            exit;
          end;
        until up[n] <> table[p[n]];
      end;
      inc(p);
    until false
  else
    // input bounded by pEnd
    repeat
      if p >= pEnd then
        exit;
      if p^ <= #13 then
      begin
        if (p^ = #13) or
           (p^ = #10) then
          exit; // end of line: not found
      end
      else if table[p^] = up^ then
      begin
        n := 0;
        repeat
          inc(n);
          if up[n] = #0 then
          begin
            result := true; // whole up text was found
            exit;
          end;
          if p + n >= pEnd then
            break; // never compare beyond pEnd
        until up[n] <> table[p[n]];
      end;
      inc(p);
    until false;
end;
|
|
|
|
// check if the up bytes match the start of p, or of any following position
// returned by FindNextUtf8WordBegin()
// - each p code point is read via GetNextUtf8Upper() and compared to the
// successive bytes of up, until the #0 terminator of up is reached
// - returns false if p or up is nil, or if up is empty
function ContainsUtf8(p, up: PUtf8Char): boolean;
var
  u: PByte;
begin
  result := false;
  if (p = nil) or
     (up = nil) or
     (up^ = #0) then
    exit;
  repeat
    u := pointer(up);
    while GetNextUtf8Upper(p) = u^ do
    begin
      inc(u);
      if u^ = 0 then
      begin
        result := true; // up^ was found inside p^
        exit;
      end;
    end;
    // mismatch: try again from the next position
    p := FindNextUtf8WordBegin(p);
  until p = nil;
end;
|
|
|
|
// read the next code point from U, translated through NormToUpperByte[]
// - returns 0 without moving U when U^ = #0
// - a 7-bit char moves U by one byte and is translated via NormToUpperByte[]
// - other input is decoded by UTF8_TABLE.GetHighUtf8Ucs4(), then translated
// only if both the code point and its WinAnsiConvert.AnsiToWide[] entry
// are <= 255
function GetNextUtf8Upper(var U: PUtf8Char): Ucs4CodePoint;
begin
  result := ord(U^);
  if result > 127 then
  begin
    result := UTF8_TABLE.GetHighUtf8Ucs4(U);
    if (result <= 255) and
       (WinAnsiConvert.AnsiToWide[result] <= 255) then
      result := NormToUpperByte[result];
  end
  else if result <> 0 then
  begin
    inc(U);
    result := NormToUpperByte[result];
  end;
end;
|
|
|
|
// locate the start of the next word in the UTF-8 buffer U
// - first consumes code points while they are < 127 and flagged as tcWord in
// TEXT_BYTES[], then returns the position of the next code point which is
// < 127 and flagged as tcWord
// - returns nil if the #0 terminator is reached first
function FindNextUtf8WordBegin(U: PUtf8Char): PUtf8Char;
var
  cp: cardinal;
  start: PUtf8Char;
begin
  result := nil;
  // leave the current word, i.e. stop after the first non tcWord code point
  repeat
    cp := GetNextUtf8Upper(U);
    if cp = 0 then
      exit;
  until not ((cp < 127) and
             (tcWord in TEXT_BYTES[cp]));
  // search for the first tcWord char, remembering where it started
  repeat
    start := U;
    cp := GetNextUtf8Upper(U);
    if cp = 0 then
      exit;
  until (cp < 127) and
        (tcWord in TEXT_BYTES[cp]);
  result := start;
end;
|
|
|
|
// compare two #0-terminated UTF-16 buffers, using the NormToUpperAnsi7Byte
// table for WideChars <= 255
// - returns 0 if both pointers are equal (including both nil)
// - returns -1 if only u1 is nil, or 1 if only u2 is nil
// - if two WideChars differ and one of them is > 255, returns their raw
// difference; otherwise returns the difference of their table[] values
function AnsiICompW(u1, u2: PWideChar): PtrInt;
var
  w1, w2: PtrInt;
  {$ifdef CPUX86NOTPIC}
  table: TNormTableByte absolute NormToUpperAnsi7Byte;
  {$else}
  table: PByteArray;
  {$endif CPUX86NOTPIC}
begin
  result := 0;
  if u1 = u2 then
    exit; // u1=u2
  if u1 = nil then
  begin
    result := -1; // u1=''
    exit;
  end;
  if u2 = nil then
  begin
    result := 1; // u2=''
    exit;
  end;
  {$ifndef CPUX86NOTPIC}
  table := @NormToUpperAnsi7Byte;
  {$endif CPUX86NOTPIC}
  repeat
    w1 := PtrInt(u1^);
    w2 := PtrInt(u2^);
    result := w1 - w2;
    if result <> 0 then
    begin
      if (w1 > 255) or
         (w2 > 255) then
        exit; // outside of the table: return the raw difference
      result := table[w1] - table[w2];
      if result <> 0 then
        exit;
    end;
    if (w1 = 0) or
       (w2 = 0) then
      break; // end of text with result = 0
    inc(u1);
    inc(u2);
  until false;
end;
|
|
|
|
// compare two #0-terminated buffers through the NormToUpperByte table
// - returns 0 if both pointers are equal (including both nil)
// - returns -1 if only Str1 is nil, or 1 if only Str2 is nil
// - otherwise returns the difference between the first pair of translated
// bytes which differ, or 0 if both buffers end together
function AnsiIComp(Str1, Str2: pointer): PtrInt;
var
  a, b: byte; // byte locals: integer/PtrInt are actually slower on FPC
  tab: PByteArray; // local pointer: better x86-64 / PIC asm generation
begin
  // result is first used as the byte distance between both buffers
  result := PtrInt(PtrUInt(Str2)) - PtrInt(PtrUInt(Str1));
  if result = 0 then
    exit; // same pointer
  if Str1 = nil then
  begin
    result := -1; // Str1=''
    exit;
  end;
  if Str2 = nil then
  begin
    result := 1; // Str2=''
    exit;
  end;
  tab := @NormToUpperByte;
  repeat
    a := tab[PByteArray(Str1)[0]];
    b := tab[PByteArray(Str1)[result]]; // = next Str2 byte
    inc(PByte(Str1));
  until (a = 0) or
        (a <> b);
  result := a - b;
end;
|
|
|
|
// comparison callback: A and B are read as pointers to text buffers, which
// are compared via StrIComp()
function SortDynArrayAnsiStringI(const A, B): integer;
var
  s1, s2: PUtf8Char;
begin
  s1 := PUtf8Char(A);
  s2 := PUtf8Char(B);
  result := StrIComp(s1, s2); // very aggressively inlined
end;
|
|
|
|
// comparison callback: A and B are read as PUtf8Char values, which are
// compared via StrIComp()
function SortDynArrayPUtf8CharI(const A, B): integer;
var
  s1, s2: PUtf8Char;
begin
  s1 := PUtf8Char(A);
  s2 := PUtf8Char(B);
  result := StrIComp(s1, s2);
end;
|
|
|
|
// comparison callback for the generic string type
// - if UNICODE is defined, A and B are read as PWideChar and compared via
// AnsiICompW(), otherwise they are read as PUtf8Char and compared via
// StrIComp()
function SortDynArrayStringI(const A, B): integer;
begin
  {$ifdef UNICODE}
  // UTF-16 string content
  result := AnsiICompW(PWideChar(A), PWideChar(B));
  {$else}
  // 8-bit string content
  result := StrIComp(PUtf8Char(A), PUtf8Char(B));
  {$endif UNICODE}
end;
|
|
|
|
// comparison callback: A and B are read as PWideChar values, which are
// compared via AnsiICompW()
function SortDynArrayUnicodeStringI(const A, B): integer;
var
  w1, w2: PWideChar;
begin
  w1 := PWideChar(A);
  w2 := PWideChar(B);
  result := AnsiICompW(w1, w2);
end;
|
|
|
|
// in-place conversion of the #0-terminated UTF-8 buffer P using Table[]
// - returns the number of bytes written back into the buffer (0 if P is nil)
// - 7-bit chars are replaced by their Table[] value
// - a multi-byte sequence decoded as a code point <= 255 whose Table[] value
// is <= 127 is replaced by this single byte, so output may be shorter
// - any other valid sequence is copied as is
// - processing stops at the first invalid leading byte, continuation byte or
// decoded value, returning the length written so far
function ConvertCaseUtf8(P: PUtf8Char; const Table: TNormTableByte): PtrInt;
var
  dst, seq: PUtf8Char;
  c: PtrUInt;
  n, i: PtrInt;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  result := 0;
  if P = nil then
    exit;
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  dst := P; // output overwrites the input buffer
  repeat
    c := byte(P[0]);
    inc(P);
    if c = 0 then
      break;
    if c <= 127 then
    begin
      // plain 7-bit char
      dst[result] := AnsiChar(Table[c]);
      inc(result);
    end
    else
    begin
      n := utf8.Lookup[c]; // number of bytes following the leading byte
      if n = UTF8_INVALID then
        exit; // invalid leading byte
      i := 0;
      repeat
        if byte(P[i]) and $c0 <> $80 then
          exit; // invalid input content
        c := (c shl 6) + byte(P[i]);
        inc(i);
      until i = n;
      dec(c, utf8.Extra[n].offset);
      if c < utf8.Extra[n].minimum then
        exit; // invalid input content
      if (c <= 255) and
         (Table[c] <= 127) then
      begin
        // the converted char fits in a single 7-bit byte
        dst[result] := AnsiChar(Table[c]);
        inc(result);
        inc(P, n);
      end
      else
      begin
        // keep the whole sequence (leading byte included) untouched
        seq := P - 1;
        inc(P, n);
        inc(n);
        MoveByOne(seq, dst + result, n);
        inc(result, n);
      end;
    end;
  until false;
end;
|
|
|
|
// return a copy of S converted by ConvertCaseUtf8() with NormToUpperByte
// - the copy is truncated to the length returned by ConvertCaseUtf8() when it
// differs from the length of S
function UpperCaseU(const S: RawUtf8): RawUtf8;
var
  srclen, newlen: integer;
begin
  srclen := length(S);
  FastSetString(result, pointer(S), srclen); // private copy of S
  newlen := ConvertCaseUtf8(pointer(result), NormToUpperByte);
  if newlen <> srclen then
    SetLength(result, newlen);
end;
|
|
|
|
// return a copy of S converted by ConvertCaseUtf8() with NormToLowerByte
// - the copy is truncated to the length returned by ConvertCaseUtf8() when it
// differs from the length of S
function LowerCaseU(const S: RawUtf8): RawUtf8;
var
  srclen, newlen: integer;
begin
  srclen := length(S);
  FastSetString(result, pointer(S), srclen); // private copy of S
  newlen := ConvertCaseUtf8(pointer(result), NormToLowerByte);
  if newlen <> srclen then
    SetLength(result, newlen);
end;
|
|
|
|
// compare two #0-terminated UTF-8 buffers, using the NormToUpperByte table
// for all code points <= 255 (other code points are compared as such)
// - returns 0 if both pointers are equal (including both nil)
// - returns -1 if only u1 is nil, or 1 if only u2 is nil
// - if u1 ends first returns -1; if u2 ends first returns the (translated)
// current u1 code point; if both end together returns 0
// - otherwise returns the difference of the first differing code points
function Utf8IComp(u1, u2: PUtf8Char): PtrInt;
var
  c2: PtrInt;
  {$ifdef CPUX86NOTPIC}
  table: TNormTableByte absolute NormToUpperByte;
  {$else}
  table: PByteArray;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  table := @NormToUpperByte;
  {$endif CPUX86NOTPIC}
  result := 0;
  if u1 = u2 then
    exit; // u1=u2
  if u1 = nil then
  begin
    result := -1; // u1=''
    exit;
  end;
  if u2 = nil then
  begin
    result := 1; // u2=''
    exit;
  end;
  repeat
    result := ord(u1^);
    c2 := ord(u2^);
    // decode and translate the next u1 code point into result
    if result = 0 then
    begin
      // end of u1 reached
      if c2 <> 0 then
        result := -1; // u1<u2 - otherwise u1=u2 and 0 is returned
      exit;
    end;
    if result <= 127 then
    begin
      inc(u1);
      result := table[result];
    end
    else
    begin
      if result and $20 = 0 then // fast $0..$7ff process
      begin
        result := (result shl 6) + byte(u1[1]) - $3080;
        inc(u1, 2);
      end
      else
        result := UTF8_TABLE.GetHighUtf8Ucs4(u1);
      if result <= 255 then
        result := table[result]; // 8-bit to upper, 32-bit as is
    end;
    // decode and translate the next u2 code point, then subtract it
    if c2 <= 127 then
    begin
      if c2 = 0 then
        exit; // u1>u2 -> return u1^
      inc(u2);
      dec(result, table[c2]);
    end
    else
    begin
      if c2 and $20 = 0 then // fast $0..$7ff process
      begin
        c2 := (c2 shl 6) + byte(u2[1]) - $3080;
        inc(u2, 2);
      end
      else
        c2 := UTF8_TABLE.GetHighUtf8Ucs4(u2);
      if c2 <= 255 then
        c2 := table[c2]; // 8-bit to upper
      dec(result, c2);
    end;
    if result <> 0 then
      exit; // found unmatching code points
  until false;
end;
|
|
|
|
// compare two UTF-8 buffers of L1 and L2 bytes, using the NormToUpperByte
// table for all code points <= 255 (other code points are compared as such)
// - returns 0 if both pointers are equal
// - returns -1 if u1 is nil or L1 = 0, else 1 if u2 is nil or L2 = 0
// - returns -1 on invalid or truncated UTF-8 in u1, and 1 on invalid or
// truncated UTF-8 in u2
// - otherwise returns the difference of the first differing code points, or
// -1 / 1 / 0 when u1 / u2 / both are exhausted without any difference
function Utf8ILComp(u1, u2: PUtf8Char; L1, L2: cardinal): PtrInt;
var
  c2: PtrInt;
  extra, i: integer;
  {$ifdef CPUX86NOTPIC}
  table: TNormTableByte absolute NormToUpperByte;
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  table: PByteArray;
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  {$ifndef CPUX86NOTPIC}
  table := @NormToUpperByte;
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := 0;
  if u1 = u2 then
    exit; // u1=u2
  if (u1 = nil) or
     (L1 = 0) then
  begin
    result := -1; // u1=''
    exit;
  end;
  if (u2 = nil) or
     (L2 = 0) then
  begin
    result := 1; // u2=''
    exit;
  end;
  repeat
    result := ord(u1^);
    c2 := ord(u2^);
    // decode and translate the next u1 code point into result
    inc(u1);
    dec(L1);
    if result <= 127 then
      result := table[result]
    else
    begin
      extra := utf8.Lookup[result];
      if extra = UTF8_INVALID then
      begin
        result := -1; // invalid leading byte
        exit;
      end;
      dec(L1, extra);
      if integer(L1) < 0 then
      begin
        result := -1; // truncated sequence
        exit;
      end;
      i := 0;
      repeat
        result := result shl 6;
        inc(result, ord(u1[i]));
        inc(i);
      until i = extra;
      inc(u1, extra);
      dec(result, utf8.Extra[extra].offset);
      if result and $ffffff00 = 0 then
        // 8-bit to upper conversion, 32-bit as is
        result := table[result];
    end;
    // decode and translate the next u2 code point, then subtract it
    inc(u2);
    dec(L2);
    if c2 <= 127 then
      dec(result, table[c2])
    else
    begin
      extra := utf8.Lookup[c2];
      if extra = UTF8_INVALID then
      begin
        result := 1; // invalid leading byte
        exit;
      end;
      dec(L2, extra);
      if integer(L2) < 0 then
      begin
        result := 1; // truncated sequence
        exit;
      end;
      i := 0;
      repeat
        c2 := c2 shl 6;
        inc(c2, ord(u2[i]));
        inc(i);
      until i = extra;
      inc(u2, extra);
      dec(c2, utf8.Extra[extra].offset);
      if c2 and $ffffff00 = 0 then
        // 8-bit to upper
        dec(result, table[c2])
      else
        // returns 32-bit diff
        dec(result, c2);
    end;
    if result <> 0 then
      exit; // found unmatching code points
    // here result = 0: check if any input is exhausted
    if L1 = 0 then
    begin
      if L2 <> 0 then
        result := -1; // u1<u2 - otherwise u1=u2 and 0 is returned
      exit;
    end;
    if L2 = 0 then
    begin
      result := 1; // u1>u2
      exit;
    end;
  until false;
end;
|
|
|
|
// return true if Utf8IComp() reports no difference between S1 and S2
// - lengths are not compared first: checking UTF-8 lengths is not accurate,
// since surrogates may be confusing
function SameTextU(const S1, S2: RawUtf8): boolean;
var
  diff: PtrInt;
begin
  diff := Utf8IComp(pointer(S1), pointer(S2));
  result := diff = 0;
end;
|
|
|
|
// search for UpperValue at the beginning of any word of the A text
// - each A char is translated via NormToUpper[] before being compared to
// UpperValue, which is used as supplied (presumably already uppercased)
// - a word starts with a char flagged as tcWord in TEXT_CHARS[]
// - returns false if A or UpperValue is nil, or once A^ = #0 is reached
function FindAnsi(A, UpperValue: PAnsiChar): boolean;
var
  v: PAnsiChar;
begin
  result := false;
  if (A = nil) or
     (UpperValue = nil) then
    exit;
  repeat
    // go to the beginning of the next word
    while (A^ <> #0) and
          not (tcWord in TEXT_CHARS[NormToUpper[A^]]) do
      inc(A);
    if A^ = #0 then
      exit;
    // check if this word starts with UpperValue
    v := UpperValue;
    while NormToUpper[A^] = v^ do
    begin
      inc(v);
      if v^ = #0 then
      begin
        result := true; // UpperValue found!
        exit;
      end;
      inc(A);
      if A^ = #0 then
        exit;
    end;
    // no match: skip the remaining chars of this word
    while (A^ <> #0) and
          (tcWord in TEXT_CHARS[NormToUpper[A^]]) do
      inc(A);
    if A^ = #0 then
      exit;
  until false;
end;
|
|
|
|
// search for the UpperLen WideChars of Upper at the beginning of any word of
// the #0-terminated PW text
// - a word starts with a WideChar > 126 or flagged as tcWord in TEXT_BYTES[],
// and ends before a WideChar < 127 not flagged as tcWord
// - candidates are checked via Unicode_CompareString() with ignorecase=true,
// a returned value of 2 being handled as a match
// - returns false if PW or Upper is nil, or if the end of PW is reached
function FindUnicode(PW, Upper: PWideChar; UpperLen: PtrInt): boolean;
var
  wordStart: PWideChar;
  w: PtrUInt;
begin
  result := false;
  if (PW = nil) or
     (Upper = nil) then
    exit;
  repeat
    // go to beginning of next word
    repeat
      w := ord(PW^);
      if w = 0 then
        exit;
      if (w > 126) or
         (tcWord in TEXT_BYTES[w]) then
        break;
      inc(PW);
    until false;
    wordStart := PW;
    // move forward until UpperLen chars, the end of text or the end of word
    repeat
      inc(PW);
      w := ord(PW^);
    until (PW - wordStart >= UpperLen) or
          (w = 0) or
          ((w < 127) and
           not (tcWord in TEXT_BYTES[w]));
    // only compare if the word has at least UpperLen chars
    if (PW - wordStart >= UpperLen) and
       (Unicode_CompareString(wordStart, Upper, UpperLen, UpperLen,
         {ignorecase=}true) = 2) then
    begin
      result := true; // case-insensitive match found
      exit;
    end;
    // not found: go to end of current word
    repeat
      w := ord(PW^);
      if w = 0 then
        exit;
      if (w < 127) and
         not (tcWord in TEXT_BYTES[w]) then
        break;
      inc(PW);
    until false;
  until false;
end;
|
|
|
|
// search for UpperValue at the beginning of any word of the UTF-8 text U
// - handles 8-bit code points (<= 255) stored as 2-byte UTF-8 sequences:
// they are translated via NormToUpper[] / NormToUpperByte[] before being
// compared to UpperValue, which is used as supplied
// - longer UTF-8 sequences are skipped while searching a word start, and
// abort the current match attempt when met within a word
// - returns false if U or UpperValue is nil, on invalid UTF-8 leading byte, or
// when the end of U is reached
function FindUtf8(U: PUtf8Char; UpperValue: PAnsiChar): boolean;
var
  v: PAnsiChar;
  c: PtrUInt;
  first: AnsiChar;
  firstMatch: boolean;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
begin
  result := false;
  if (U = nil) or
     (UpperValue = nil) then
    exit;
  {$ifndef CPUX86NOTPIC}
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  first := UpperValue^;
  repeat
    // search for the first tcWord char, and compare it with first
    firstMatch := false;
    repeat
      c := byte(U^);
      inc(U);
      if c = 0 then
        exit;
      if c <= 127 then
      begin
        if tcWord in TEXT_BYTES[c] then
        begin
          firstMatch := PAnsiChar(@NormToUpper)[c] = first;
          break;
        end;
      end
      else if c and $20 = 0 then
      begin
        // fast direct process of $0..$7ff codepoints including accents
        c := ((c shl 6) + byte(U^)) - $3080;
        inc(U);
        if c <= 255 then
        begin
          c := NormToUpperByte[c];
          if tcWord in TEXT_BYTES[c] then
          begin
            firstMatch := AnsiChar(c) = first;
            break;
          end;
        end;
      end
      else
      begin
        c := utf8.Lookup[c];
        if c = UTF8_INVALID then
          exit;
        // just ignore surrogates for soundex
        inc(U, c);
      end;
    until false;
    if firstMatch then
    begin
      // first char did match -> check the following chars of UpperValue
      v := UpperValue + 1;
      repeat
        if v^ = #0 then
        begin
          result := true; // UpperValue found!
          exit;
        end;
        c := byte(U^);
        inc(U); // next chars
        if c = 0 then
          exit;
        if c <= 127 then
        begin
          if PAnsiChar(@NormToUpper)[c] <> v^ then
            break;
        end
        else if c and $20 = 0 then
        begin
          c := ((c shl 6) + byte(U^)) - $3080;
          inc(U);
          if (c > 255) or
             (PAnsiChar(@NormToUpper)[c] <> v^) then
            break;
        end
        else
        begin
          c := utf8.Lookup[c];
          if c = UTF8_INVALID then
            exit;
          inc(U, c);
          break; // longer sequences never match
        end;
        inc(v);
      until false;
    end;
    // find beginning of next word
    U := FindNextUtf8WordBegin(U);
  until U = nil;
end;
|
|
|
|
// copy source into dest via UpperCopy255Buf(), using the length stored in the
// string header of source
// - returns dest unchanged if source is empty, otherwise the pointer returned
// by UpperCopy255Buf()
function UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PAnsiChar;
begin
  if source = '' then
    result := dest
  else
    result := UpperCopy255Buf(dest, pointer(source),
      PStrLen(PAnsiChar(pointer(source)) - _STRLEN)^);
end;
|
|
|
|
// copy up to 248 bytes of source into dest with an unbranched uppercase
// conversion, processed by blocks of 8 bytes (CPU64) or 4 bytes
// - sourceLen is truncated to 248 to avoid buffer overflow
// - whole blocks are read and written, so up to 7 (or 3) more bytes than
// sourceLen may be read from source and stored into dest: this is allowed
// since the dest size is 255
// - returns dest + sourceLen, i.e. the exact end of the converted text
function UpperCopy255Buf(dest: PAnsiChar; source: PUtf8Char; sourceLen: PtrInt): PAnsiChar;
var
  blk, v, hi, mask {$ifdef CPU64}, _80, _61, _7b {$endif}: PtrUInt;
begin
  if sourceLen <> 0 then
  begin
    if sourceLen > 248 then
      sourceLen := 248; // avoid buffer overflow
    {$ifdef CPU64}
    // 8 chars per block
    _80 := PtrUInt($8080808080808080); // use registers for constants
    _61 := $6161616161616161;
    _7b := $7b7b7b7b7b7b7b7b;
    for blk := 0 to (sourceLen - 1) shr 3 do
    begin
      v := PPtrUIntArray(source)^[blk];
      hi := v or _80;
      // mask has the high bit set for each byte to be converted
      mask := ((hi - _61) and
               not (hi - _7b)) and ((not v) and _80);
      PPtrUIntArray(dest)^[blk] := v - mask shr 2;
    end;
    {$else}
    // 4 chars per block
    for blk := 0 to (sourceLen - 1) shr 2 do
    begin
      v := PPtrUIntArray(source)^[blk];
      hi := v or PtrUInt($80808080);
      // mask has the high bit set for each byte to be converted
      mask := ((hi - PtrUInt($61616161)) and
               not(hi - PtrUInt($7b7b7b7b))) and ((not v) and PtrUInt($80808080));
      PPtrUIntArray(dest)^[blk] := v - mask shr 2;
    end;
    {$endif CPU64}
  end;
  result := dest + sourceLen; // return the exact size
end;
|
|
|
|
// copy up to 250 bytes of source into dest, each byte being translated via
// the NormToUpperByte table
// - returns dest unchanged if source is empty, otherwise the position just
// after the last byte stored
function UpperCopyWin255(dest: PWinAnsiChar; const source: RawUtf8): PWinAnsiChar;
var
  i, len: PtrInt;
  {$ifdef CPUX86NOTPIC}
  tab: TNormTableByte absolute NormToUpperByte;
  {$else}
  tab: PByteArray; // faster on PIC/ARM and x86_64
  {$endif CPUX86NOTPIC}
begin
  result := dest;
  if source = '' then
    exit;
  len := PStrLen(PAnsiChar(pointer(source)) - _STRLEN)^;
  if len > 250 then
    len := 250; // avoid buffer overflow
  inc(result, len);
  {$ifndef CPUX86NOTPIC}
  tab := @NormToUpperByte;
  {$endif CPUX86NOTPIC}
  i := 0;
  while i < len do
  begin
    dest[i] := AnsiChar(tab[PByteArray(source)[i]]);
    inc(i);
  end;
end;
|
|
|
|
// copy SourceChars bytes of UTF-8 text from Source into Dest, translating
// chars via the NormToUpper table
// - 7-bit chars are translated, by blocks of 4 when possible
// - a multi-byte sequence decoded as a code point <= 255 whose NormToUpper[]
// value is <= #127 is stored as this single byte; any other valid sequence is
// copied untouched
// - processing stops at the first invalid, out-of-range or truncated sequence
// - returns the position just after the last byte stored in Dest, or Dest
// itself if Source or Dest is nil
function Utf8UpperCopy(Dest, Source: PUtf8Char; SourceChars: cardinal): PUtf8Char;
var
  c: cardinal;
  stop, stopBy4, up: PUtf8Char;
  extra, i: PtrInt;
  {$ifdef CPUX86NOTPIC}
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  single, quad, stored; // ugly but faster
begin
  if (Source <> nil) and
     (Dest <> nil) then
  begin
    {$ifndef CPUX86NOTPIC}
    utf8 := @UTF8_TABLE;
    {$endif CPUX86NOTPIC}
    stop := Source + SourceChars;
    stopBy4 := stop - 4; // last position where a whole quad may be read
    up := @NormToUpper;
    // first handle leading 7-bit ASCII chars, by quad (Sha optimization)
    if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
       (Source <= stopBy4) then
      repeat
        c := PCardinal(Source)^;
        if c and $80808080 <> 0 then
          goto single; // break on first non ASCII quad
quad:   inc(Source, 4); // here c contains four 7-bit chars
        Dest[0] := up[ToByte(c)];
        Dest[1] := up[ToByte(c shr 8)];
        Dest[2] := up[ToByte(c shr 16)];
        Dest[3] := up[ToByte(c shr 24)];
        inc(Dest, 4);
      until Source > stopBy4;
    // generic loop, handling one UCS4 CodePoint per iteration
    if Source < stop then
      repeat
single: c := byte(Source^);
        inc(Source);
        if c <= 127 then
        begin
          Dest^ := up[c];
stored:   inc(Dest); // one byte was just written to Dest^
          if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(Source) and 3 = 0) and{$endif}
             (Source <= stopBy4) then
          begin
            // try to switch back to the fast quad process
            c := PCardinal(Source)^;
            if c and $80808080 = 0 then
              goto quad;
            continue;
          end
          else if Source < stop then
            continue
          else
            break;
        end
        else
        begin
          extra := utf8.Lookup[c]; // number of bytes after the leading one
          if (extra = UTF8_INVALID) or
             (Source + extra > stop) then
            break; // invalid leading byte or truncated input
          i := 0;
          repeat
            c := (c shl 6) + byte(Source[i]);
            inc(i)
          until i = extra;
          dec(c, utf8.Extra[extra].offset);
          if c < utf8.Extra[extra].minimum then
            break; // invalid input content
          if (c <= 255) and
             (up[c] <= #127) then
          begin
            // the converted char fits in a single 7-bit byte
            Dest^ := up[c];
            inc(Source, extra);
            goto stored;
          end;
          Dest^ := Source[-1]; // leading byte
          repeat // here we know extra>0 - just copy UTF-8 input untouched
            inc(Dest);
            Dest^ := Source^;
            inc(Source);
            dec(extra);
            if extra = 0 then
              goto stored;
          until false;
        end;
      until false;
  end;
  result := Dest;
end;
|
|
|
|
// convert up to 250 bytes of source into dest via Utf8UpperCopy()
// - returns dest unchanged if source is empty, otherwise the pointer returned
// by Utf8UpperCopy()
function Utf8UpperCopy255(dest: PAnsiChar; const source: RawUtf8): PUtf8Char;
var
  len: integer;
begin
  len := length(source);
  if len <= 0 then
  begin
    result := pointer(dest);
    exit;
  end;
  if len > 250 then
    len := 250; // avoid buffer overflow
  result := Utf8UpperCopy(pointer(dest), pointer(source), len);
end;
|
|
|
|
// overload calling UpperCopy255W() with the content and length of source
function UpperCopy255W(dest: PAnsiChar; const source: SynUnicode): PAnsiChar;
var
  len: PtrInt;
begin
  len := length(source);
  result := UpperCopy255W(dest, pointer(source), len);
end;
|
|
|
|
// convert up to 250 WideChars of source into 8-bit chars stored in dest
// - WideChars <= 255 are translated via the NormToUpperAnsi7Byte table, which
// has one entry per byte value - U+00FF included: the former "c < 255" test
// was an off-by-one, emitting '?' for this last valid index
// - any WideChar > 255 is stored as '?'
// - returns the position just after the last char stored in dest, or dest
// itself if L <= 0
function UpperCopy255W(dest: PAnsiChar; source: PWideChar; L: PtrInt): PAnsiChar;
var
  c: PtrUInt;
  d: byte;
  lookupper: PByteArray; // better x86-64 / PIC asm generation
begin
  if L > 0 then
  begin
    if L > 250 then
      L := 250; // avoid buffer overflow
    lookupper := @NormToUpperAnsi7Byte;
    repeat
      c := PWord(source)^;
      d := ord('?'); // default for anything outside of the 8-bit table
      if c <= 255 then
        d := lookupper[c];
      dest^ := AnsiChar(d);
      inc(dest);
      inc(source);
      dec(L);
    until L = 0;
  end;
  result := dest;
end;
|
|
|
|
// copy source into dest, each byte being translated via the
// NormToUpperAnsi7Byte table, until a translated byte equals 0
// - no length limit is applied: dest is expected to be big enough
// - returns the position just after the last byte stored in dest, i.e. dest
// itself if source is empty
function UpperCopy(dest: PAnsiChar; const source: RawUtf8): PAnsiChar;
var
  src: PAnsiChar;
  b: byte;
  tab: PByteArray; // local pointer: better x86-64 / PIC asm generation
begin
  result := dest;
  src := pointer(source);
  if src = nil then
    exit;
  tab := @NormToUpperAnsi7Byte;
  b := tab[ord(src^)];
  while b <> 0 do
  begin
    result^ := AnsiChar(b);
    inc(result);
    inc(src);
    b := tab[ord(src^)];
  end;
end;
|
|
|
|
// copy the chars of a ShortString into dest, each byte being translated via
// the NormToUpperAnsi7Byte table
// - the number of chars is read from the first byte of source
// - returns the position just after the last byte stored in dest
function UpperCopyShort(dest: PAnsiChar; const source: ShortString): PAnsiChar;
var
  src: PByteArray;
  i, len: PtrInt;
  tab: PByteArray; // local pointer: better x86-64 / PIC asm generation
begin
  src := @source;
  tab := @NormToUpperAnsi7Byte;
  len := src[0]; // length byte
  for i := 0 to len - 1 do
    dest[i] := AnsiChar(tab[src[i + 1]]);
  result := dest + len;
end;
|
|
|
|
// convert S via a temporary UTF-16 buffer processed by Unicode_InPlaceUpper()
// - S is decoded with Utf8ToWideChar(), then the buffer is encoded back with
// RawUnicodeToUtf8()
// - returns '' if S is empty
function UpperCaseUnicode(const S: RawUtf8): RawUtf8;
var
  tmp: TSynTempBuffer;
  srclen, wchars: integer;
begin
  if S = '' then
  begin
    result := '';
    exit;
  end;
  srclen := length(S);
  tmp.Init(srclen * 2); // two bytes per input byte requested
  // Utf8ToWideChar() result is divided by 2 to get a WideChar count
  wchars := Utf8ToWideChar(tmp.buf, pointer(S), srclen) shr 1;
  RawUnicodeToUtf8(tmp.buf, Unicode_InPlaceUpper(tmp.buf, wchars), result);
  tmp.Done;
end;
|
|
|
|
// uppercase conversion of a SynUnicode value, delegated to the RTL
// - calls SysUtils.UpperCase() if UNICODE is defined, UnicodeUpperCase() if
// HASVARUSTRING is defined, or WideUpperCase() otherwise
function UpperCaseSynUnicode(const S: SynUnicode): SynUnicode;
begin
  {$ifdef UNICODE}
  result := SysUtils.UpperCase(S);
  {$else}
    {$ifdef HASVARUSTRING}
    result := UnicodeUpperCase(S);
    {$else}
    result := WideUpperCase(S);
    {$endif HASVARUSTRING}
  {$endif UNICODE}
end;
|
|
|
|
// lowercase conversion of a SynUnicode value, delegated to the RTL
// - calls SysUtils.LowerCase() if UNICODE is defined, UnicodeLowerCase() if
// HASVARUSTRING is defined, or WideLowerCase() otherwise
function LowerCaseSynUnicode(const S: SynUnicode): SynUnicode;
begin
  {$ifdef UNICODE}
  result := SysUtils.LowerCase(S);
  {$else}
    {$ifdef HASVARUSTRING}
    result := UnicodeLowerCase(S);
    {$else}
    result := WideLowerCase(S);
    {$endif HASVARUSTRING}
  {$endif UNICODE}
end;
|
|
|
|
// convert S via a temporary UTF-16 buffer processed by Unicode_InPlaceLower()
// - S is decoded with Utf8ToWideChar(), then the buffer is encoded back with
// RawUnicodeToUtf8()
// - returns '' if S is empty
function LowerCaseUnicode(const S: RawUtf8): RawUtf8;
var
  tmp: TSynTempBuffer;
  srclen, wchars: integer;
begin
  if S = '' then
  begin
    result := '';
    exit;
  end;
  srclen := length(S);
  tmp.Init(srclen * 2); // two bytes per input byte requested
  // Utf8ToWideChar() result is divided by 2 to get a WideChar count
  wchars := Utf8ToWideChar(tmp.buf, pointer(S), srclen) shr 1;
  RawUnicodeToUtf8(tmp.buf, Unicode_InPlaceLower(tmp.buf, wchars), result);
  tmp.Done;
end;
|
|
|
|
// overload calling IsCaseSensitive() with the content and length of S
function IsCaseSensitive(const S: RawUtf8): boolean;
var
  len: PtrInt;
begin
  len := length(S);
  result := IsCaseSensitive(pointer(S), len);
end;
|
|
|
|
// return true if any of the first PLen bytes of P is in 'a'..'z' or 'A'..'Z'
// - returns false if P is nil or PLen <= 0
function IsCaseSensitive(P: PUtf8Char; PLen: PtrInt): boolean;
begin
  result := false;
  if P = nil then
    exit;
  while PLen > 0 do
  begin
    if ord(P^) in [ord('a')..ord('z'), ord('A')..ord('Z')] then
    begin
      result := true; // found a letter
      exit;
    end;
    inc(P);
    dec(PLen);
  end;
end;
|
|
|
|
// store into Dest a new string of Len bytes, each byte of Text being
// translated via Table[]
// - the new string is allocated by FastNewString() with the CP_UTF8 code page,
// filled, then assigned to Dest via FastAssignNew()
procedure CaseCopy(Text: PUtf8Char; Len: PtrInt; Table: PNormTable;
  var Dest: RawUtf8);
var
  n: PtrInt;
  buf: PAnsiChar;
begin
  buf := FastNewString(Len, CP_UTF8);
  n := 0;
  while n < Len do
  begin
    buf[n] := Table[Text[n]]; // branchless conversion
    inc(n);
  end;
  FastAssignNew(Dest, buf);
end;
|
|
|
|
// translate each byte of S in-place via Table[]
// - the buffer to modify is retrieved via UniqueRawUtf8()
procedure CaseSelf(var S: RawUtf8; Table: PNormTable);
var
  n, len: PtrInt;
  buf: PUtf8Char;
begin
  buf := UniqueRawUtf8(S);
  len := length(S);
  n := 0;
  while n < len do
  begin
    buf[n] := Table[buf[n]]; // branchless conversion
    inc(n);
  end;
end;
|
|
|
|
// return a copy of S translated by CaseCopy() with the NormToUpperAnsi7 table
function UpperCase(const S: RawUtf8): RawUtf8;
var
  len: PtrInt;
begin
  len := length(S);
  CaseCopy(pointer(S), len, @NormToUpperAnsi7, result);
end;
|
|
|
|
// store into Dest the Len bytes of Text, translated by CaseCopy() with the
// NormToUpperAnsi7 table
procedure UpperCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8);
var
  tab: PNormTable;
begin
  tab := @NormToUpperAnsi7;
  CaseCopy(Text, Len, tab, Dest);
end;
|
|
|
|
// store into Dest the content of Source, translated by CaseCopy() with the
// NormToUpperAnsi7 table
procedure UpperCaseCopy(const Source: RawUtf8; var Dest: RawUtf8);
var
  len: PtrInt;
begin
  len := length(Source);
  CaseCopy(pointer(Source), len, @NormToUpperAnsi7, Dest);
end;
|
|
|
|
// translate S in-place by CaseSelf() with the NormToUpperAnsi7 table
procedure UpperCaseSelf(var S: RawUtf8);
var
  tab: PNormTable;
begin
  tab := @NormToUpperAnsi7;
  CaseSelf(S, tab);
end;
|
|
|
|
// return a copy of S translated by CaseCopy() with the NormToLowerAnsi7 table
function LowerCase(const S: RawUtf8): RawUtf8;
var
  len: PtrInt;
begin
  len := length(S);
  CaseCopy(pointer(S), len, @NormToLowerAnsi7, result);
end;
|
|
|
|
// store into Dest the Len bytes of Text, translated by CaseCopy() with the
// NormToLowerAnsi7 table
procedure LowerCaseCopy(Text: PUtf8Char; Len: PtrInt; var Dest: RawUtf8);
var
  tab: PNormTable;
begin
  tab := @NormToLowerAnsi7;
  CaseCopy(Text, Len, tab, Dest);
end;
|
|
|
|
// translate S in-place by CaseSelf() with the NormToLowerAnsi7 table
procedure LowerCaseSelf(var S: RawUtf8);
var
  tab: PNormTable;
begin
  tab := @NormToLowerAnsi7;
  CaseSelf(S, tab);
end;
|
|
|
|
// search pSub inside p from the 1-based Offset, every char being compared
// through the Lookup[] table
// - both pSub and p lengths are read from their string header (via PStrLen),
// so they are expected to point to the content of string variables
// - returns 0 if any pointer is nil, if Offset <= 0, if pSub is empty or does
// not fit within p from Offset, or if no match is found; otherwise returns
// the 1-based position of the match
// - the algorithm scans p for the LAST char of pSub, then verifies the
// previous chars by pairs, using negative indexes
function PosExIPas(pSub, p: PUtf8Char; Offset: PtrUInt;
  Lookup: PNormTable): PtrInt;
var
  len, lenSub: PtrInt;
  last: AnsiChar;
  pStart, pStop: PUtf8Char;
label
  s2, s6, tt, t0, t1, t2, t3, t4, s0, s1;
begin
  result := 0;
  if (p = nil) or
     (pSub = nil) or
     (PtrInt(Offset) <= 0) or
     (Lookup = nil) then
    exit;
  len := PStrLen(p - _STRLEN)^;
  lenSub := PStrLen(pSub - _STRLEN)^ - 1; // index of the last pSub char
  if (len < lenSub + PtrInt(Offset)) or
     (lenSub < 0) then
    exit;
  pStop := p + len;
  // make both pointers relative to the last char of pSub
  inc(p, lenSub);
  inc(pSub, lenSub);
  pStart := p;
  p := @p[Offset + 3];
  last := Lookup[pSub[0]];
  lenSub := -lenSub; // negative index of the first pSub char
  if p < pStop then
    goto s6;
  dec(p, 4);
  goto s2;
s6: // check 6 chars per loop iteration with O(1) case comparison
  if last = Lookup[p[-4]] then
    goto t4;
  if last = Lookup[p[-3]] then
    goto t3;
  if last = Lookup[p[-2]] then
    goto t2;
  if last = Lookup[p[-1]] then
    goto t1;
s2:
  if last = Lookup[p[0]] then
    goto t0;
s1:
  if last = Lookup[p[1]] then
    goto tt;
s0:
  inc(p, 6);
  if p < pStop then
    goto s6;
  dec(p, 4);
  if p >= pStop then
    exit; // not found
  goto s2;
  // adjust p so that the candidate is at p[0] (t0) or p[1] (tt)
t4:
  dec(p, 2);
t2:
  dec(p, 2);
  goto t0;
t3:
  dec(p, 2);
t1:
  dec(p, 2);
tt: // last char matched at p[1]: verify the previous chars by pairs
  len := lenSub;
  if lenSub <> 0 then
    repeat
      if (Lookup[pSub[len]] <> Lookup[p[len + 1]]) or
         (Lookup[pSub[len + 1]] <> Lookup[p[len + 2]]) then
        goto s0;
      inc(len, 2);
    until len >= 0;
  inc(p, 2);
  if p <= pStop then
    result := p - pStart; // found
  exit;
t0: // last char matched at p[0]: verify the previous chars by pairs
  len := lenSub;
  if lenSub <> 0 then
    repeat
      if (Lookup[pSub[len]] <> Lookup[p[len]]) or
         (Lookup[pSub[len + 1]] <> Lookup[p[len + 1]]) then
        goto s1;
      inc(len, 2);
    until len >= 0;
  inc(p);
  result := p - pStart; // found
end;
|
|
|
|
// search SubStr inside S from Offset, by calling PosExIPas() with the
// NormToUpperAnsi7 lookup table
function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt): PtrInt;
var
  tab: PNormTable;
begin
  tab := @NormToUpperAnsi7;
  result := PosExIPas(pointer(SubStr), pointer(S), Offset, tab);
end;
|
|
|
|
// search SubStr inside S from Offset, with an optional Lookup table
// - if Lookup is nil, no table is used: the search is delegated to PosEx()
// (if CPUX86 is defined) or PosExPas()
// - otherwise, calls PosExIPas() with the supplied Lookup table
function PosExI(const SubStr, S: RawUtf8; Offset: PtrUInt;
  Lookup: PNormTable): PtrInt;
begin
  if Lookup <> nil then
    result := PosExIPas(pointer(SubStr), pointer(S), Offset, Lookup)
  else
    {$ifdef CPUX86}
    result := PosEx(SubStr, S, Offset);
    {$else}
    result := PosExPas(pointer(SubStr), pointer(S), Offset);
    {$endif CPUX86}
end;
|
|
|
|
|
|
{ ************ UTF-8 String Manipulation Functions }
|
|
|
|
// return true if text is at least as long as textStart, and its first bytes
// are identical to textStart according to CompareMem()
function StartWithExact(const text, textStart: RawUtf8): boolean;
var
  len: PtrInt;
begin
  len := length(textStart);
  if length(text) < len then
    result := false // text is too short
  else
    result := CompareMem(pointer(text), pointer(textStart), len);
end;
|
|
|
|
// return true if text is at least as long as textEnd, and its last bytes
// are identical to textEnd according to CompareMem()
function EndWithExact(const text, textEnd: RawUtf8): boolean;
var
  len, offs: PtrInt;
begin
  len := length(textEnd);
  offs := length(text) - len; // where textEnd should start within text
  if offs < 0 then
    result := false // text is too short
  else
    result := CompareMem(PUtf8Char(pointer(text)) + offs, pointer(textEnd), len);
end;
|
|
|
|
// extract the current line from the #0-terminated source buffer
// - the line ends at the first #0, #10 or #13 char
// - next is set to nil when the line ended with #0 (or if source is nil),
// otherwise to the position just after the #10, #13 or #13#10 line feed
// - if andtrim is true, leading and trailing #9 and space chars are excluded
// from the returned line
// - returns '' and next = nil if source is nil
function GetNextLine(source: PUtf8Char; out next: PUtf8Char; andtrim: boolean): RawUtf8;
var
  start: PUtf8Char;
begin
  if source = nil then
  begin
    {$ifdef FPC}
    FastAssignNew(result);
    {$else}
    result := '';
    {$endif FPC}
    next := source;
    exit;
  end;
  if andtrim then // optional trim left
    while source^ in [#9, ' '] do
      inc(source);
  start := source;
  // search for the end of line
  repeat
    // fast process 4 chars per loop - never reading after a #0 thanks to
    // the short-circuit evaluation of each condition
    while (source[0] > #13) and
          (source[1] > #13) and
          (source[2] > #13) and
          (source[3] > #13) do
      inc(source, 4);
    while source^ > #13 do
      inc(source);
    if source^ in [#0, #10, #13] then
      break;
    inc(source); // any other control char is part of the line
  until false;
  case source^ of
    #0:
      next := nil;
    #10:
      next := source + 1;
  else
    // #13 - maybe followed by #10
    if source[1] = #10 then
      next := source + 2
    else
      next := source + 1;
  end;
  if andtrim then // optional trim right
    while (source > start) and
          (source[-1] in [#9, ' ']) do
      dec(source);
  FastSetString(result, start, source - start);
end;
|
|
|
|
// return the first n bytes of S, as Copy(S, 1, n)
// - n is a number of bytes, not of UTF-8 code points
function LeftU(const S: RawUtf8; n: PtrInt): RawUtf8;
const
  FIRST = 1; // strings are 1-based
begin
  result := Copy(S, FIRST, n);
end;
|
|
|
|
// return the last n bytes of S, or the whole S if n > length(S)
// - n is a number of bytes, not of UTF-8 code points
function RightU(const S: RawUtf8; n: PtrInt): RawUtf8;
var
  len, first: PtrInt;
begin
  len := length(S);
  if n > len then
    n := len; // never start before the first char
  first := len + 1 - n;
  result := Copy(S, first, n);
end;
|
|
|
|
// return S without its leading chars <= ' '
// - returns S itself (no new allocation) if there is nothing to trim
// - returns '' if S only contains chars <= ' '
function TrimLeft(const S: RawUtf8): RawUtf8;
var
  i, l: PtrInt;
begin
  l := Length(S);
  i := 1;
  while (i <= l) and
        (S[i] <= ' ') do
    inc(i);
  if i = 1 then
    result := S
  else
    // S[i]..S[l] are kept, i.e. l - i + 1 chars (the former "l - i" length
    // was truncating the last char, and was negative for all-blank input)
    FastSetString(result, @PByteArray(S)[i - 1], l - i + 1);
end;
|
|
|
|
// return S without its trailing chars <= ' '
// - a new string is always set via FastSetString(), even if nothing was
// trimmed
function TrimRight(const S: RawUtf8): RawUtf8;
var
  len: PtrInt;
begin
  len := Length(S);
  while len > 0 do
  begin
    if S[len] > ' ' then
      break; // last char to keep
    dec(len);
  end;
  FastSetString(result, pointer(S), len);
end;
|
|
|
|
// remove in-place the leading chars <= ' ' of each line of S
// - since #13 and #10 are <= ' ', line feeds and void lines are skipped too,
// and each remaining line is ended with a single #10 if followed by a line
// feed in the input
// - S is made unique via UniqueRawUtf8(), then shortened via FakeLength()
// - S is set to '' if nothing remains
procedure TrimLeftLines(var S: RawUtf8);
var
  src, dst: PUtf8Char;
begin
  if S = '' then
    exit;
  src := UniqueRawUtf8(S);
  dst := src; // in-place process
  repeat
    // ignore any leading blank or control char
    while src^ in [#1 .. ' '] do
      inc(src);
    // copy the line content
    while not (src^ in [#0, #10, #13]) do
    begin
      dst^ := src^;
      inc(src);
      inc(dst);
    end;
    if src^ = #0 then
      break;
    // normalize the line feed: following #13/#10 are skipped on next loop
    dst^ := #10;
    inc(dst);
  until false;
  if dst = pointer(S) then
    S := ''
  else
    FakeLength(S, dst); // no SetLength needed
end;
|
|
|
|
// remove Left bytes at the start and Right bytes at the end of S
// - negative Left or Right are handled as 0
// - does nothing if S is empty or if there is nothing to trim
// - S is set to '' if no byte remains
// - if the reference count of S is 1, S is modified in-place; otherwise a
// new string is created via FastSetString()
procedure TrimChars(var S: RawUtf8; Left, Right: PtrInt);
var
  P: PUtf8Char;
  newlen: PtrInt;
begin
  P := pointer(S);
  if P = nil then
    exit;
  if Left < 0 then
    Left := 0;
  if Right < 0 then
    Right := 0;
  if Left + Right = 0 then
    exit; // nothing to trim
  newlen := PStrLen(P - _STRLEN)^ - (Left + Right); // compute new length
  if newlen <= 0 then
    FastAssignNew(S)
  else if PStrCnt(P - _STRCNT)^ <> 1 then // RefCnt=1 ?
    FastSetString(S, P + Left, newlen) // create a new unique string
  else
  begin
    // we can modify it in-place
    PStrLen(P - _STRLEN)^ := newlen;
    if Left <> 0 then
      MoveFast(P[Left], P^, newlen);
    P[newlen] := #0;
  end;
end;
|
|
|
|
// return the text following the last SepChar of Str
// - optional LeftStr^ receives the text preceding this last SepChar
// - if SepChar is not found, returns Str and sets LeftStr^ to ''
function SplitRight(const Str: RawUtf8; SepChar: AnsiChar; LeftStr: PRawUtf8): RawUtf8;
var
  p: PtrInt;
begin
  p := length(Str);
  while p > 0 do
  begin
    if Str[p] = SepChar then
    begin
      FastSetString(result, @PByteArray(Str)[p], length(Str) - p);
      if LeftStr <> nil then
        FastSetString(LeftStr^, pointer(Str), p - 1);
      exit;
    end;
    dec(p); // search backward
  end;
  result := Str;
  if LeftStr <> nil then
    FastAssignNew(LeftStr^);
end;
|
|
|
|
// return the text following the last occurrence in Str of any char of SepChar
// - a single-char SepChar is delegated to SplitRight()
// - returns Str if SepChar is void, or if none of its chars appears in Str
function SplitRights(const Str, SepChar: RawUtf8): RawUtf8;
var
  i, j, sep: PtrInt;
  c: AnsiChar;
begin
  sep := length(SepChar);
  if sep = 1 then
  begin
    result := SplitRight(Str, SepChar[1]);
    // leave now: the former code did fall through to the final result := Str
    // assignment, so a single-char separator always returned the whole Str
    exit;
  end;
  if sep > 1 then
    for i := length(Str) downto 1 do
    begin
      c := Str[i];
      for j := 1 to sep do
        if c = SepChar[j] then
        begin
          FastSetString(result, @PByteArray(Str)[i], length(Str) - i);
          exit;
        end;
    end;
  result := Str; // no separator found
end;
|
|
|
|
// split Str at the first SepStr occurrence into LeftStr and RightStr
// - returns false, with LeftStr=Str and RightStr='', if SepStr was not found
// - both outputs go through UpperCaseSelf() if ToUpperCase is set
function Split(const Str, SepStr: RawUtf8; var LeftStr, RightStr: RawUtf8;
  ToUpperCase: boolean): boolean;
var
  ndx: PtrInt;
  leftTmp: pointer; // may be called as Split(Str,SepStr,Str,RightStr)
begin
  if length(SepStr) = 1 then
    ndx := PosExChar(SepStr[1], Str) // may use SSE2 on i386/x86_64
  else
    ndx := PosEx(SepStr, Str);
  result := ndx <> 0;
  if result then
  begin
    dec(ndx); // = length of the left part
    leftTmp := nil;
    FastSetString(RawUtf8(leftTmp), pointer(Str), ndx);
    inc(ndx, length(SepStr)); // = 0-based offset of the right part
    FastSetString(RightStr, @PByteArray(Str)[ndx], length(Str) - ndx);
    FastAssignNew(LeftStr, leftTmp); // Str is not needed any more
  end
  else
  begin
    LeftStr := Str;
    RightStr := '';
  end;
  if not ToUpperCase then
    exit;
  UpperCaseSelf(LeftStr);
  UpperCaseSelf(RightStr);
end;
|
|
|
|
// overload returning the right part as function result
// - the boolean "found" flag of the main Split() is ignored
function Split(const Str, SepStr: RawUtf8; var LeftStr: RawUtf8;
  ToUpperCase: boolean): RawUtf8;
begin
  Split(Str, SepStr, LeftStr, result, ToUpperCase);
end;
|
|
|
|
// split Str into several DestPtr[]^ strings, using the SepStr[] separators
// - SepStr[0] ends the first value, SepStr[1] the second... and the last
// SepStr[] item is reused once the separators list is exhausted
// - any DestPtr[] = nil item is skipped (but counted)
// - DestPtr[] items located after the last extracted value are set to ''
// - returns the number of values extracted into DestPtr[]
function Split(const Str: RawUtf8; const SepStr: array of RawUtf8;
  const DestPtr: array of PRawUtf8): PtrInt;
var
  s, i, j, seplen: PtrInt;
  P: pointer;
begin
  j := 1;
  result := 0;
  s := 0;
  if high(SepStr) >= 0 then
    while result <= high(DestPtr) do
    begin
      P := @PByteArray(Str)[j - 1];
      i := PosEx(SepStr[s], Str, j);
      if i = 0 then
      begin
        // no more separator: the remaining text is the last value
        if DestPtr[result] <> nil then
          FastSetString(DestPtr[result]^, P, length(Str) - j + 1);
        inc(result);
        break;
      end;
      if DestPtr[result] <> nil then
        FastSetString(DestPtr[result]^, P, i - j);
      inc(result);
      // continue after the whole separator: the former j := i + 1 was only
      // correct for one-char separators, and did leave the tail of a longer
      // separator at the beginning of the next value
      seplen := length(SepStr[s]);
      if seplen < 1 then
        seplen := 1; // paranoid: always move forward
      j := i + seplen;
      if s < high(SepStr) then
        inc(s);
    end;
  for i := result to high(DestPtr) do
    if DestPtr[i] <> nil then
      FastAssignNew(DestPtr[i]^);
end;
|
|
|
|
// returns true if text is '' or only contains bytes <= ' '
function IsVoid(const text: RawUtf8): boolean;
var
  i: PtrInt;
begin
  for i := 1 to length(text) do
    if text[i] > ' ' then
    begin
      result := false; // at least one visible char
      exit;
    end;
  result := true;
end;
|
|
|
|
// return text with all its bytes <= ' ' removed
// - text itself is returned, with no allocation, if it has no such byte
function TrimControlChars(const text: RawUtf8): RawUtf8;
var
  len, first, j, n: PtrInt;
  dest: PAnsiChar;
begin
  len := length(text);
  // locate the first byte to be removed
  first := 1;
  while (first <= len) and
        (text[first] > ' ') do
    inc(first);
  if first > len then
  begin
    result := text; // no control char found
    exit;
  end;
  // copy the untouched prefix, then filter the remaining bytes one by one
  n := first - 1;
  FastSetString(result, len);
  dest := pointer(result);
  if n > 0 then
    MoveFast(pointer(text)^, dest^, n);
  for j := first + 1 to len do
    if text[j] > ' ' then
    begin
      dest[n] := text[j];
      inc(n);
    end;
  FakeSetLength(result, n);
end;
|
|
|
|
// return text with all the chars member of the exclude set removed
// - text itself is returned, with no allocation, if no such char is found
function TrimChar(const text: RawUtf8; const exclude: TSynAnsicharSet): RawUtf8;
var
  len, first, j, n: PtrInt;
  dest: PAnsiChar;
begin
  len := length(text);
  // locate the first char to be removed
  first := 1;
  while (first <= len) and
        not (text[first] in exclude) do
    inc(first);
  if first > len then
  begin
    result := text; // no exclude char found
    exit;
  end;
  // at least one char is removed, so len - 1 bytes are enough
  n := first - 1;
  FastSetString(result, len - 1);
  dest := pointer(result);
  if n > 0 then
    MoveFast(pointer(text)^, dest^, n);
  for j := first + 1 to len do
    if not (text[j] in exclude) then
    begin
      dest[n] := text[j];
      inc(n);
    end;
  FakeSetLength(result, n);
end;
|
|
|
|
// return text with all the occurrences of the exclude char removed
// - text itself is returned, with no allocation, if exclude does not appear
function TrimOneChar(const text: RawUtf8; exclude: AnsiChar): RawUtf8;
var
  first, len, i: PtrInt;
  c: AnsiChar;
  dest: PAnsiChar;
begin
  len := length(text);
  first := ByteScanIndex(pointer(text), len, ord(exclude)); // 0-based or -1
  if first >= 0 then
  begin
    FastSetString(result, len - 1); // at least one char will be removed
    dest := pointer(result);
    MoveFast(pointer(text)^, dest^, first); // prefix with no exclude char
    inc(dest, first);
    for i := first + 1 to len do
    begin
      c := text[i];
      if c <> exclude then
      begin
        dest^ := c;
        inc(dest);
      end;
    end;
    FakeSetLength(result, dest - pointer(result));
  end
  else
    result := text; // no exclude char found
end;
|
|
|
|
// return text with only the chars member of the "only" set
// - implemented as TrimChar() over the bitwise-negated set
function OnlyChar(const text: RawUtf8; const only: TSynAnsicharSet): RawUtf8;
var
  w: PtrInt;
  inverted: array[0..(SizeOf(only) shr POINTERSHR) - 1] of PtrInt;
begin
  // reverse all bits into a local stack copy, one PtrInt at a time
  for w := 0 to high(inverted) do
    inverted[w] := not PPtrIntArray(@only)[w];
  result := TrimChar(text, TSynAnsicharSet(inverted));
end;
|
|
|
|
// fill the secret content with zeros, then release the variable
// - the buffer is only overwritten if its reference count is 1
procedure FillZero(var secret: RawByteString);
var
  rec: PStrRec;
begin
  if secret <> '' then
  begin
    rec := PStrRec(Pointer(PtrInt(secret) - _STRRECSIZE));
    if rec^.refCnt = 1 then // avoid GPF if const
      FillCharFast(pointer(secret)^, rec^.length, 0);
  end;
  FastAssignNew(secret); // dec refCnt
end;
|
|
|
|
// RawUtf8 overload, redirecting to the RawByteString implementation
procedure FillZero(var secret: RawUtf8);
begin
  FillZero(RawByteString(secret));
end;
|
|
|
|
// SpiUtf8 overload, redirecting to the RawByteString implementation
procedure FillZero(var secret: SpiUtf8);
begin
  FillZero(RawByteString(secret));
end;
|
|
|
|
{$ifdef HASVARUSTRING}
|
|
// fill the UnicodeString secret content with zeros, then finalize it
// - the buffer is only overwritten if its reference count is 1
procedure FillZero(var secret: UnicodeString);
var
  rec: PStrRec;
begin
  if secret <> '' then
  begin
    rec := PStrRec(Pointer(PtrInt(secret) - _STRRECSIZE));
    if rec^.refCnt = 1 then // avoid GPF if const
      FillCharFast(pointer(secret)^, rec^.length * SizeOf(WideChar), 0);
  end;
  Finalize(secret); // dec refCnt
end;
|
|
{$endif HASVARUSTRING}
|
|
|
|
// fill the TBytes secret content with zeros, then release the array
// - the buffer is only overwritten if its reference count is 1
procedure FillZero(var secret: TBytes);
var
  rec: PDynArrayRec;
begin
  if secret <> nil then
  begin
    rec := PDynArrayRec(Pointer(PtrInt(secret) - _DARECSIZE));
    if rec^.refCnt = 1 then // avoid GPF if const
      FillCharFast(pointer(secret)^, rec^.length, 0);
  end;
  secret := nil; // dec refCnt
end;
|
|
|
|
// internal StringReplaceAll() process, when at least one match was found
// - found is the 1-based position of the first OldPattern occurrence in S
function StringReplaceAllProcess(const S, OldPattern, NewPattern: RawUtf8;
  found: integer; Lookup: PNormTable): RawUtf8;
var
  ndx, prev, oldsize, newsize, gap: PtrInt;
  hitCount: integer;
  hits: TIntegerDynArray; // 1-based position of each OldPattern occurrence
  rd, wr: PAnsiChar;
begin
  oldsize := length(OldPattern);
  newsize := length(NewPattern);
  // first pass: locate all non-overlapping occurrences
  SetLength(hits, 64);
  hits[0] := found;
  hitCount := 1;
  found := PosExI(OldPattern, S, found + oldsize, Lookup);
  while found <> 0 do
  begin
    AddInteger(hits, hitCount, found);
    found := PosExI(OldPattern, S, found + oldsize, Lookup);
  end;
  // second pass: fill the result, which final size is now known
  FastSetString(result, Length(S) + (newsize - oldsize) * hitCount);
  prev := 1;
  rd := pointer(S);
  wr := pointer(result);
  for ndx := 0 to hitCount - 1 do
  begin
    gap := hits[ndx] - prev; // text between two occurrences
    MoveFast(rd^, wr^, gap);
    inc(rd, gap + oldsize);
    inc(wr, gap);
    if newsize > 0 then
    begin
      MoveByOne(pointer(NewPattern), wr, newsize);
      inc(wr, newsize);
    end;
    prev := hits[ndx] + oldsize;
  end;
  MoveFast(rd^, wr^, length(S) - prev + 1); // text after the last occurrence
end;
|
|
|
|
// replace all OldPattern occurrences in S by NewPattern
// - Lookup is the table supplied to PosExI() for the search, and may be nil
// - S itself is returned if there is nothing to replace
function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8;
  Lookup: PNormTable): RawUtf8;
var
  first: integer;
begin
  if (S = '') or
     (OldPattern = '') or
     (OldPattern = NewPattern) then
  begin
    result := S;
    exit;
  end;
  // locate the first occurrence
  if (Lookup = nil) and
     (length(OldPattern) = 1) then
    // single byte search: ByteScanIndex() returns -1 if not found -> 0 here
    first := ByteScanIndex(pointer(S), {%H-}PStrLen(PtrUInt(S) - _STRLEN)^,
      byte(OldPattern[1])) + 1
  else
    first := PosExI(OldPattern, S, 1, Lookup); // handle Lookup=nil
  if first = 0 then
    result := S
  else
    result := StringReplaceAllProcess(S, OldPattern, NewPattern, first, Lookup);
end;
|
|
|
|
// overload selecting the search table from NORM2CASE[CaseInsensitive]
function StringReplaceAll(const S, OldPattern, NewPattern: RawUtf8;
  CaseInsensitive: boolean): RawUtf8;
begin
  result := StringReplaceAll(
    S, OldPattern, NewPattern, NORM2CASE[CaseInsensitive]);
end;
|
|
|
|
// apply several Old,New replacements to S, one pair after the other
// - S is returned unchanged if OldNewPatternPairs[] has not an even and
// non-zero number of items
function StringReplaceAll(const S: RawUtf8;
  const OldNewPatternPairs: array of RawUtf8; CaseInsensitive: boolean): RawUtf8;
var
  last, pair: PtrInt;
  tab: PNormTable;
begin
  result := S;
  last := high(OldNewPatternPairs);
  if (last <= 0) or
     not odd(last) then
    exit; // we expect matching pairs, i.e. an odd highest index
  tab := NORM2CASE[CaseInsensitive];
  for pair := 0 to last shr 1 do
    result := StringReplaceAll(result,
      OldNewPatternPairs[pair * 2], OldNewPatternPairs[pair * 2 + 1], tab);
end;
|
|
|
|
// replace all OldChar occurrences in Source by NewChar
// - Source itself is returned if OldChar = NewChar or if OldChar is not found
function StringReplaceChars(const Source: RawUtf8; OldChar, NewChar: AnsiChar): RawUtf8;
var
  first, j, len: PtrInt;
  dest: PAnsiChar;
begin
  first := -1;
  len := 0;
  if (OldChar <> NewChar) and
     (Source <> '') then
  begin
    len := length(Source);
    first := ByteScanIndex(pointer(Source), len, ord(OldChar));
  end;
  if first < 0 then
  begin
    result := Source; // nothing to replace
    exit;
  end;
  FastSetString(result, pointer(Source), len); // work on a private copy
  dest := pointer(result);
  for j := first to len - 1 do
    if dest[j] = OldChar then
      dest[j] := NewChar;
end;
|
|
|
|
// replace every #9 of Source by the TabText content
// - Source itself is returned if it has no #9, or if TabText is '' (in which
// case the tabs are not even counted)
function StringReplaceTabs(const Source, TabText: RawUtf8): RawUtf8;

  // copy the #0-terminated S into D, writing T/TLen instead of each #9
  procedure Process(S, D, T: PAnsiChar; TLen: integer);
  begin
    while S^ <> #0 do
    begin
      if S^ = #9 then
      begin
        if TLen > 0 then
        begin
          MoveByOne(T, D, TLen);
          inc(D, TLen);
        end;
      end
      else
      begin
        D^ := S^;
        inc(D);
      end;
      inc(S);
    end;
  end;

var
  srclen, i, tabs, tablen: PtrInt;
begin
  tablen := length(TabText);
  srclen := Length(Source);
  tabs := 0;
  if tablen <> 0 then
    for i := 1 to srclen do
      if Source[i] = #9 then
        inc(tabs);
  if tabs = 0 then
  begin
    result := Source;
    exit;
  end;
  // each #9 is replaced by tablen bytes, i.e. adds pred(tablen) to the length
  FastSetString(result, srclen + tabs * pred(tablen));
  Process(pointer(Source), pointer(result), pointer(TabText), tablen);
end;
|
|
|
|
// return a RawUtf8 made of Count times the Ch byte
// - returns '' if Count <= 0
function RawUtf8OfChar(Ch: AnsiChar; Count: integer): RawUtf8;
begin
  if Count > 0 then
  begin
    FastSetString(result, Count);
    FillCharFast(pointer(result)^, Count, byte(Ch));
  end
  else
    FastAssignNew(result);
end;
|
|
|
|
// function overload, calling the PUtf8Char/PLen QuotedStr() implementation
function QuotedStr(const S: RawUtf8; Quote: AnsiChar): RawUtf8;
begin
  QuotedStr(pointer(S), length(S), Quote, result);
end;
|
|
|
|
// procedure overload, which allows S and result to be the same variable
procedure QuotedStr(const S: RawUtf8; Quote: AnsiChar; var result: RawUtf8);
var
  src: PUtf8Char;
  keep: pointer; // will hold a RawUtf8 with no try..finally exception block
begin
  keep := nil;
  src := pointer(S);
  if (src <> nil) and
     (src = pointer(result)) then
  begin
    // make private ref-counted copy for QuotedStr(U,'"',U)
    RawUtf8(keep) := S;
    src := pointer(keep);
  end;
  QuotedStr(src, length(S), Quote, result);
  if keep = nil then
    exit;
  // release our private reference
  {$ifdef FPC}
  FastAssignNew(keep);
  {$else}
  RawUtf8(keep) := '';
  {$endif FPC}
end;
|
|
|
|
// store into result the P/PLen buffer enclosed with Quote chars
// - any Quote char within the buffer is doubled
procedure QuotedStr(P: PUtf8Char; PLen: PtrInt; Quote: AnsiChar;
  var result: RawUtf8);
var
  i, first, extra: PtrInt;
  dst: PUtf8Char;
  c: AnsiChar;
begin
  // count the quotes to be doubled, scanning from the first one found
  extra := 0;
  first := ByteScanIndex(pointer(P), PLen, byte(Quote)); // asm if available
  if first >= 0 then
    for i := first to PLen - 1 do
      if P[i] = Quote then
        inc(extra);
  // allocate: text + doubled quotes + opening and closing quotes
  FastSetString(result, PLen + extra + 2);
  dst := pointer(result);
  dst^ := Quote;
  inc(dst);
  if extra = 0 then
  begin
    // no quote within: plain copy
    MoveFast(P^, dst^, PLen);
    dst[PLen] := Quote;
    exit;
  end;
  // copy the text before the first quote at once
  MoveFast(P^, dst^, first);
  inc(dst, first);
  // from now on, PLen and first are used as absolute memory addresses
  inc(PLen, PtrInt(PtrUInt(P))); // efficient use of registers on FPC
  inc(first, PtrInt(PtrUInt(P)));
  while first <> PLen do
  begin
    c := PAnsiChar(first)^;
    inc(first);
    dst^ := c;
    inc(dst);
    if c = Quote then
    begin
      dst^ := c; // double the quote
      inc(dst);
    end;
  end;
  dst^ := Quote;
end;
|
|
|
|
// go to the end of a quoted string - P^ is the opening quote at function call
// - a doubled quote is handled as a single quote char inside the string
// - at function return, result^ is the closing quote, or #0 if none was found
function GotoEndOfQuotedString(P: PUtf8Char): PUtf8Char;
var
  q: AnsiChar;
begin
  q := P^; // whatever char is at P^ is used as quote
  inc(P);
  while P^ <> #0 do
    if P^ <> q then
      inc(P)
    else if P[1] = q then
      inc(P, 2) // allow double quotes inside string
    else
      break; // end quote
  result := P;
end;
|
|
|
|
// skip all chars in #1..' ' - will stop at #0
function GotoNextNotSpace(P: PUtf8Char): PUtf8Char;
begin
  {$ifdef FPC}
  while (P^ <> #0) and
        (P^ <= ' ') do
    inc(P);
  {$else}
  while P^ in [#1..' '] do
    inc(P);
  {$endif FPC}
  result := P;
end;
|
|
|
|
// skip all spaces and tabs - will stop at any other char, e.g. #0 #10 #13
function GotoNextNotSpaceSameLine(P: PUtf8Char): PUtf8Char;
begin
  while (P^ = ' ') or
        (P^ = #9) do
    inc(P);
  result := P;
end;
|
|
|
|
// go to the first char <= ' ' - which may be the #0 terminator
function GotoNextSpace(P: PUtf8Char): PUtf8Char;
begin
  while P^ > ' ' do
    inc(P);
  result := P;
end;
|
|
|
|
// skip all chars in #1..' ', then check if the next char is ch
// - P is moved after ch if it did match, or left on the first non-space char
function NextNotSpaceCharIs(var P: PUtf8Char; ch: AnsiChar): boolean;
begin
  while (P^ <> #0) and
        (P^ <= ' ') do
    inc(P);
  result := P^ = ch;
  if result then
    inc(P); // consume the expected char
end;
|
|
|
|
// skip the tcCtrlNot0Comma chars, and one optional /*comment*/ block
function GotoNextSqlIdentifier(P: PUtf8Char; tab: PTextCharSet): PUtf8Char;
  {$ifdef HASINLINE} inline; {$endif}
const
  COMMENT_OPEN  = ord('/') + ord('*') shl 8; // '/*' read as a word
  COMMENT_CLOSE = ord('*') + ord('/') shl 8; // '*/' read as a word
begin
  while tcCtrlNot0Comma in tab[P^] do // in [#1..' ', ';']
    inc(P);
  if PWord(P)^ = COMMENT_OPEN then
  begin
    // detect and ignore e.g. '/*nocache*/'
    repeat
      inc(P);
      if PWord(P)^ = COMMENT_CLOSE then
      begin
        inc(P, 2);
        break;
      end;
    until P^ = #0;
    while tcCtrlNot0Comma in tab[P^] do
      inc(P);
  end;
  result := P;
end;
|
|
|
|
// extract the next tcIdentifier chars from P into Prop
// - P is moved to the next identifier, via GotoNextSqlIdentifier()
// - returns false if no identifier char was found, i.e. Prop = ''
function GetNextFieldProp(var P: PUtf8Char; var Prop: RawUtf8): boolean;
var
  start: PUtf8Char;
  chars: PTextCharSet;
begin
  chars := @TEXT_CHARS;
  P := GotoNextSqlIdentifier(P, chars); // handle /*comment*/
  start := P;
  while tcIdentifier in chars[P^] do
    inc(P); // go to end of ['_', '0'..'9', 'a'..'z', 'A'..'Z'] chars
  FastSetString(Prop, start, P - start);
  P := GotoNextSqlIdentifier(P, chars);
  result := Prop <> '';
end;
|
|
|
|
// extract the next tcIdentifier chars from P into the Prop ShortString
// - the tcCtrlNotLF chars located before and after the name are skipped
// - returns false if no identifier char was found, i.e. Prop = ''
function GetNextFieldPropSameLine(var P: PUtf8Char; var Prop: ShortString): boolean;
var
  start: PUtf8Char;
  chars: PTextCharSet;
begin
  chars := @TEXT_CHARS;
  while tcCtrlNotLF in chars[P^] do
    inc(P); // ignore [#1..#9, #11, #12, #14..' ']
  start := P;
  while tcIdentifier in chars[P^] do
    inc(P); // go to end of field name
  SetString(Prop, PAnsiChar(start), P - start);
  while tcCtrlNotLF in chars[P^] do
    inc(P);
  result := Prop <> '';
end;
|
|
|
|
// unquote a quoted string into Value - P^ is the opening quote at call
// - doubled quotes within the text are unescaped as a single quote
// - returns nil if P is nil or if no closing quote was found
// - otherwise, returns a pointer just after the closing quote
function UnQuoteSqlStringVar(P: PUtf8Char; out Value: RawUtf8): PUtf8Char;
var
  q: AnsiChar;
  start, dst: PUtf8Char;
  doubled: PtrInt;
begin
  result := nil;
  if P = nil then
    exit;
  q := P^; // " or '
  inc(P);
  // first pass: compute the unquoted string length
  start := P;
  doubled := 0;
  P := PosChar(P, q); // fast SSE2 search on x86_64
  if P = nil then
    exit; // we need at least an ending quote
  repeat
    if P^ = #0 then
      exit // where is my quote?
    else if P^ <> q then
      inc(P)
    else if P[1] = q then
    begin
      inc(P, 2); // allow double quotes inside string
      inc(doubled);
    end
    else
      break; // end quote
  until false;
  // second pass: create the unquoted string
  if doubled = 0 then
    FastSetString(Value, start, P - start) // no quote within
  else
  begin
    // unescape internal quotes
    pointer(Value) := FastNewString(P - start - doubled, CP_UTF8);
    P := start;
    dst := pointer(Value);
    repeat
      if P[0] = q then
        if P[1] = q then
          inc(P) // only keep one of the two quotes
        else
          break; // end quote
      dst^ := P[0];
      inc(dst);
      inc(P);
    until false;
  end;
  result := P + 1;
end;
|
|
|
|
// function wrapper around UnQuoteSqlStringVar(), ignoring its returned pointer
function UnQuoteSqlString(const Value: RawUtf8): RawUtf8;
begin
  UnQuoteSqlStringVar(pointer(Value), result);
end;
|
|
|
|
// remove the first and last chars of ExternalDBSymbol if it starts with one
// of the [ " ' ( chars - the last char itself is not checked
function UnQuotedSqlSymbolName(const ExternalDBSymbol: RawUtf8): RawUtf8;
begin
  if (ExternalDBSymbol = '') or
     not (ExternalDBSymbol[1] in ['[', '"', '''', '(']) then
    result := ExternalDBSymbol
  else
    // e.g. for ZDBC's GetFields()
    result := copy(ExternalDBSymbol, 2, length(ExternalDBSymbol) - 2);
end;
|
|
|
|
// check via IdemPChar() if source starts with searchUp, then move source
// to the next line using GotoNextLine()
// - returns false, and leaves source untouched, if source is nil
function IdemPCharAndGetNextLine(var source: PUtf8Char; searchUp: PAnsiChar): boolean;
begin
  result := false;
  if source = nil then
    exit;
  result := IdemPChar(source, searchUp);
  source := GotoNextLine(source);
end;
|
|
|
|
// search for a line of the #0-terminated P text starting with UpperName
// - chars of P are converted through the NormToUpperAnsi7 table before being
// compared to UpperName
// - lines are separated by any sequence of #13 and/or #10
// - returns nil if P or UpperName is nil, or if no line did match
// - if found, returns a pointer just after the matching UpperName text
function FindNameValue(P: PUtf8Char; UpperName: PAnsiChar): PUtf8Char;
var
  table: PNormTable; // faster even on i386
  u: PAnsiChar;
label
  eof, eol;
begin
  if (P = nil) or
     (UpperName = nil) then
    goto eof;
  table := @NormToUpperAnsi7;
  repeat
    if table[P^] <> UpperName^ then // first character is likely not to match
      repeat // quickly go to end of current line
        repeat
eol:      if P^ <= #13 then
            break;
          inc(P);
        until false;
        if (P^ = #13) or
           (P^ = #10) then
        begin
          repeat
            inc(P);
          until (P^ <> #10) and
                (P^ <> #13);
          if P^ = #0 then
            goto eof;
          break; // handle next line
        end
        else if P^ <> #0 then
        begin
          // a control char which is no line feed, e.g. #9: it must be skipped
          // before resuming the scan - a plain "continue" left P unchanged,
          // so the inner loop stopped on the very same char again, forever
          inc(P);
          continue;
        end;
eof:    result := nil; // reached P^=#0 -> not found
        exit;
      until false
    else
    begin
      // first char did match -> try other chars
      inc(P);
      u := UpperName + 1;
      repeat
        if u^ = #0 then
          break
        else if u^ <> table[P^] then
          goto eol; // no match: skip the remaining of this line
        inc(P);
        inc(u);
      until false;
      result := P; // if found, points just after UpperName
      exit;
    end;
  until false;
end;
|
|
|
|
// search for a line starting with UpperName, and locate its trimmed value
// - if UpperNameSeparator is not #0, it is expected just after UpperName:
// a match followed by another char is ignored, and the search continues
// - the value starts after any leading tab/space, ends before the first char
// <= #13, and has its trailing spaces excluded from FoundLen
// - returns nil (with FoundLen = 0) if no matching line was found
// - otherwise, returns a pointer to the value, with its size in FoundLen
function FindNameValuePointer(NameValuePairs: PUtf8Char; UpperName: PAnsiChar;
  out FoundLen: PtrInt; UpperNameSeparator: AnsiChar): PUtf8Char;
var
  P: PUtf8Char;
  L: PtrInt;
begin
  FoundLen := 0; // never leave the output parameter undefined
  P := FindNameValue(NameValuePairs, UpperName);
  if P <> nil then
    repeat
      if UpperNameSeparator <> #0 then
        if P^ = UpperNameSeparator then
          inc(P) // e.g. THttpSocket.HeaderGetValue uses UpperNameSeparator=':'
        else
        begin
          // UpperName is only the prefix of a longer name: the former code
          // did return this pointer as a value, with FoundLen never set
          P := FindNameValue(P, UpperName); // search for another match
          if P = nil then
            break; // not found
          continue;
        end;
      while P^ in [#9, ' '] do // trim left
        inc(P);
      L := 0;
      while P[L] > #13 do // end of line/value
        inc(L);
      // trim right - L > 0 avoids reading before P, and a negative length,
      // when the value is void and preceded by a space
      while (L > 0) and
            (P[L - 1] = ' ') do
        dec(L);
      FoundLen := L;
      break;
    until false;
  result := P;
end;
|
|
|
|
// retrieve into Value the text located by FindNameValuePointer()
// - returns false if UpperName was not found: Value is then set to '',
// unless KeepNotFoundValue is true
function FindNameValue(const NameValuePairs: RawUtf8; UpperName: PAnsiChar;
  var Value: RawUtf8; KeepNotFoundValue: boolean; UpperNameSeparator: AnsiChar): boolean;
var
  txt: PUtf8Char;
  len: PtrInt;
begin
  txt := FindNameValuePointer(
    pointer(NameValuePairs), UpperName, len, UpperNameSeparator);
  result := txt <> nil;
  if result then
    FastSetString(Value, txt, len)
  else if not KeepNotFoundValue then
    {$ifdef FPC}
    FastAssignNew(Value);
    {$else}
    Value := '';
    {$endif FPC}
end;
|
|
|
|
// return the number of bytes of the line starting at P
// - the line ends at the first #10 or #13, or when reaching PEnd
// - if PEnd is nil, the line also ends at the first #0
function GetLineSize(P, PEnd: PUtf8Char): PtrUInt;
var
  ch: byte;
begin
  // note: result is used as the current memory address during the search
  {$ifdef CPUX64}
  if PEnd <> nil then
  begin
    result := BufferLineLength(P, PEnd); // use branchless SSE2 on x86_64
    exit;
  end;
  result := PtrUInt(P) - 1;
  {$else}
  result := PtrUInt(P) - 1;
  if PEnd <> nil then
    repeat // inlined BufferLineLength()
      inc(result);
      if PtrUInt(result) >= PtrUInt(PEnd) then
        break;
      ch := PByte(result)^;
    until (ch <= 13) and
          ((ch = 10) or
           (ch = 13))
  else
  {$endif CPUX64}
    repeat // inlined BufferLineLength() ending at #0 for PEnd=nil
      inc(result);
      ch := PByte(result)^;
    until (ch <= 13) and
          ((ch = 0) or (ch = 10) or (ch = 13));
  dec(result, PtrUInt(P)); // returns length
end;
|
|
|
|
// returns false if the line starting at P has more than aMinimalCount chars
// before the first #10, #13 or PEnd - and true otherwise, or if P is nil
function GetLineSizeSmallerThan(P, PEnd: PUtf8Char; aMinimalCount: integer): boolean;
begin
  result := false;
  if P <> nil then
    while (P < PEnd) and
          (P^ <> #10) and
          (P^ <> #13) do
    begin
      if aMinimalCount = 0 then
        exit; // one char too many
      dec(aMinimalCount);
      inc(P);
    end;
  result := true;
end;
|
|
|
|
{$ifndef PUREMORMOT2}
|
|
// return the chars >= ' ' found at P, converted via StringToRawUnicode()
// - P is then moved after the following control chars, or set to nil if the
// #0 terminator was reached
// - returns '' if P is nil
function GetNextStringLineToRawUnicode(var P: PChar): RawUnicode;
var
  cur: PChar;
begin
  if P = nil then
  begin
    result := '';
    exit;
  end;
  cur := P;
  while cur^ >= ' ' do
    inc(cur);
  result := StringToRawUnicode(P, cur - P);
  while (cur^ <> #0) and
        (cur^ < ' ') do
    inc(cur); // ignore e.g. #13 or #10
  if cur^ = #0 then
    P := nil
  else
    P := cur;
end;
|
|
{$endif PUREMORMOT2}
|
|
|
|
// return a pointer to the first char of V which is not in 'a'..'z'
// - returns the beginning of V if it is only made of 'a'..'z' chars
// - returns nil if V is ''
function TrimLeftLowerCase(const V: RawUtf8): PUtf8Char;
begin
  result := Pointer(V);
  if result = nil then
    exit;
  while result^ in ['a'..'z'] do
    inc(result);
  if result^ = #0 then
    result := Pointer(V); // nothing left: keep the whole text
end;
|
|
|
|
// function overload of the TrimLeftLowerCaseToShort() procedure
function TrimLeftLowerCaseToShort(V: PShortString): ShortString;
begin
  TrimLeftLowerCaseToShort(V, result);
end;
|
|
|
|
// copy V^ into result, without its leading 'a'..'z' chars
// - V^ is copied as a whole if it is only made of 'a'..'z' chars
procedure TrimLeftLowerCaseToShort(V: PShortString; out result: ShortString);
var
  txt: PAnsiChar;
  remain: integer;
begin
  remain := length(V^);
  txt := @V^[1];
  while (remain > 0) and
        (txt^ in ['a'..'z']) do
  begin
    inc(txt);
    dec(remain);
  end;
  if remain <> 0 then
    SetString(result, txt, remain)
  else
    result := V^; // nothing left: keep the whole text
end;
|
|
|
|
// return V^ as RawUtf8, without its leading 'a'..'z' chars
// - V^ is returned as a whole if it is only made of 'a'..'z' chars
function TrimLeftLowerCaseShort(V: PShortString): RawUtf8;
var
  txt: PAnsiChar;
  remain: integer;
begin
  remain := length(V^);
  txt := @V^[1];
  while (remain > 0) and
        (txt^ in ['a'..'z']) do
  begin
    inc(txt);
    dec(remain);
  end;
  if remain <> 0 then
    FastSetString(result, txt, remain)
  else
    FastSetString(result, @V^[1], length(V^)); // nothing left: keep all
end;
|
|
|
|
// append text/len and a ',' to the result ShortString
// - leading 'a'..'z' chars are first skipped if trimlowercase is set: nothing
// is appended if only a single such char remains
// - nothing is appended either if it would reach the 255 chars limit
procedure AppendShortComma(text: PAnsiChar; len: PtrInt; var result: ShortString;
  trimlowercase: boolean);
begin
  if trimlowercase then
    while text^ in ['a'..'z'] do
    begin
      if len = 1 then
        exit;
      inc(text);
      dec(len);
    end;
  if integer(ord(result[0])) + len >= 255 then
    exit; // no room for text + ','
  if len > 0 then
    MoveByOne(text, @result[ord(result[0]) + 1], len);
  inc(result[0], len + 1);
  result[ord(result[0])] := ',';
end;
|
|
|
|
// compare P1P2Len bytes at the P1 and P2 addresses, ignoring bit $20 of
// each byte (mask $df)
// - at least one byte is always compared, so P1P2Len should be > 0
function IdemPropNameUSmallNotVoid(P1, P2, P1P2Len: PtrInt): boolean;
  {$ifdef HASINLINE}inline;{$endif}
begin
  inc(P1P2Len, P1); // P1P2Len is now the end address of the P1 buffer
  dec(P2, P1);      // P2 is now the offset from P1 to the second buffer
  repeat
    if ((PByte(P1)^ xor ord(PAnsiChar(P1)[P2])) and $df) <> 0 then
    begin
      result := false;
      exit;
    end;
    inc(P1);
  until P1 >= P1P2Len;
  result := true;
end;
|
|
|
|
// search aValue/aValueLen within a list of consecutive ShortString items
// - List items are numbered from 0 to MaxValue
// - comparison is done by IdemPropNameUSmallNotVoid() on the whole item
// - returns the index of the matching item, or -1 if not found
function FindShortStringListExact(List: PShortString; MaxValue: integer;
  aValue: PUtf8Char; aValueLen: PtrInt): integer;
var
  itemlen: PtrInt;
begin
  if aValueLen <> 0 then
    for result := 0 to MaxValue do
    begin
      itemlen := PByte(List)^;
      if (itemlen = aValueLen) and
         IdemPropNameUSmallNotVoid(PtrInt(@List^[1]), PtrInt(aValue), itemlen) then
        exit;
      inc(PUtf8Char(List), itemlen + 1); // next = length byte + content
    end;
  result := -1;
end;
|
|
|
|
// search aValue/aValueLen within a list of consecutive ShortString items,
// ignoring the leading 'a'..'z' chars of each item
// - comparison is done by IdemPropNameUSmallNotVoid()
// - returns the index of the matching item, or -1 if not found
function FindShortStringListTrimLowerCase(List: PShortString; MaxValue: integer;
  aValue: PUtf8Char; aValueLen: PtrInt): integer;
var
  remain: PtrInt;
begin
  if aValueLen <> 0 then
    for result := 0 to MaxValue do
    begin
      remain := ord(List^[0]);
      inc(PUtf8Char(List)); // List now points to the item chars
      repeat // trim lower case
        if not (PUtf8Char(List)^ in ['a'..'z']) then
          break;
        inc(PUtf8Char(List));
        dec(remain);
      until remain = 0;
      if (remain = aValueLen) and
         IdemPropNameUSmallNotVoid(PtrInt(aValue), PtrInt(List), remain) then
        exit;
      inc(PUtf8Char(List), remain); // next
    end;
  result := -1;
end;
|
|
|
|
// search aValue/aValueLen within a list of consecutive ShortString items,
// ignoring the leading 'a'..'z' chars of each item
// - comparison is done by CompareMemSmall()
// - returns the index of the matching item, or -1 if not found
function FindShortStringListTrimLowerCaseExact(List: PShortString;
  MaxValue: integer; aValue: PUtf8Char; aValueLen: PtrInt): integer;
var
  remain: PtrInt;
begin
  if aValueLen <> 0 then
    for result := 0 to MaxValue do
    begin
      remain := ord(List^[0]);
      inc(PUtf8Char(List)); // List now points to the item chars
      repeat // trim lower case
        if not (PUtf8Char(List)^ in ['a'..'z']) then
          break;
        inc(PUtf8Char(List));
        dec(remain);
      until remain = 0;
      if (remain = aValueLen) and
         CompareMemSmall(aValue, List, remain) then
        exit;
      inc(PUtf8Char(List), remain); // next
    end;
  result := -1;
end;
|
|
|
|
// RawUtf8 wrapper around the PUtf8Char UnCamelCase() function
// - a temporary buffer of twice the length of S is used for the output
function UnCamelCase(const S: RawUtf8): RawUtf8;
var
  tmp: TSynTempBuffer;
  written: PtrInt;
begin
  if S <> '' then
  begin
    written := UnCamelCase(tmp.Init(length(S) * 2), pointer(S));
    tmp.Done(PAnsiChar(tmp.buf) + written, result);
  end
  else
    result := '';
end;
|
|
|
|
// un-camel-case the #0-terminated P text into the D buffer
// - a space is written between the detected words
// - '_' is written as ' - ' and '__' as ': '
// - returns the number of bytes written to D, or 0 if D or P is nil
// - no #0 terminator is written to D
function UnCamelCase(D, P: PUtf8Char): integer;
var
  sp, spBeg, start: PUtf8Char;
  caps: integer;
  digits: boolean;
label
  Skip;
begin
  start := D;
  if (D <> nil) and
     (P <> nil) then
  begin
    // avoid GPF
    sp := D;
    spBeg := D;
    repeat
      // copy a run of digits, or one char followed by any 'A'..'Z'
      caps := 0;
      digits := P^ in ['0'..'9'];
      if digits then
        repeat
          inc(caps);
          D^ := P^;
          inc(P);
          inc(D);
        until not (P^ in ['0'..'9'])
      else
        repeat
          inc(caps);
          D^ := P^;
          inc(P);
          inc(D);
        until not (P^ in ['A'..'Z']);
      if P^ = #0 then
        break; // no lowercase conversion of last fully uppercased word
      if (caps > 1) and
         not digits then
      begin
        // the last uppercase char belongs to the next word
        dec(P);
        dec(D);
      end;
      // copy the lowercase chars
      while P^ in ['a'..'z'] do
      begin
        D^ := P^;
        inc(D);
        inc(P);
      end;
      if P^ = '_' then
        if P[1] = '_' then
        begin
          D^ := ':';
          inc(P);
          inc(D);
          goto Skip;
        end
        else
        begin
          PWord(D)^ := ord(' ') + ord('-') shl 8;
          inc(D, 2);
Skip:     if sp = spBeg then
            spBeg := D + 1;
          inc(P);
          sp := D + 1;
        end
      else
        sp := D;
      if P^ = #0 then
        break;
      D^ := ' ';
      inc(D);
    until false;
    // final backward pass, doing the lowercase conversion
    if sp > start then
      dec(sp);
    while sp > spBeg do
    begin
      if sp^ in ['A'..'Z'] then
        if not (sp[1] in ['A'..'Z', ' ']) then
          inc(sp^, 32); // lowercase conversion of not last fully uppercased word
      dec(sp);
    end;
  end;
  result := D - start;
end;
|
|
|
|
// store into s the P/len text with all its non-isWord chars removed, and the
// first char of each following word converted via NormToUpperAnsi7[]
// - only the first 256 chars of the input are processed
// - the input is copied unchanged if it contains only isWord chars
procedure CamelCase(P: PAnsiChar; len: PtrInt; var s: RawUtf8; const isWord: TSynByteSet);
var
  n: PtrInt;
  dst: PAnsiChar;
  tmp: array[byte] of AnsiChar;
begin
  if len > SizeOf(tmp) then
    len := SizeOf(tmp);
  // locate the first char which is not part of a word
  n := 0;
  while (n < len) and
        (ord(P[n]) in isWord) do
    inc(n);
  if n < len then
  begin
    // copy the leading word as such
    if n > 0 then
    begin
      MoveFast(P^, tmp, n);
      inc(P, n);
      dec(len, n);
    end;
    dst := @tmp[n];
    while len > 0 do
    begin
      // skip the separators
      while (len > 0) and
            not (ord(P^) in isWord) do
      begin
        inc(P);
        dec(len);
      end;
      if len = 0 then
        break;
      // uppercase the first char of the word
      dst^ := NormToUpperAnsi7[P^];
      inc(dst);
      // copy the other chars of the word
      repeat
        inc(P);
        dec(len);
        if not (ord(P^) in isWord) then
          break;
        dst^ := P^;
        inc(dst);
      until len = 0;
    end;
    P := @tmp;
    len := dst - tmp;
  end;
  FastSetString(s, P, len);
end;
|
|
|
|
// RawUtf8 overload of the PAnsiChar/len CamelCase() procedure
procedure CamelCase(const text: RawUtf8; var s: RawUtf8; const isWord: TSynByteSet);
begin
  CamelCase(pointer(text), length(text), s, isWord);
end;
|
|
|
|
// UnCamelCase the P text into a string, then call LoadResStringTranslate()
// on it, if this callback is assigned
// - does nothing if P is nil
// - NOTE(review): the UnCamelCase() output goes to a 256 bytes stack buffer
// with no length check here - presumably P is always a short identifier
procedure GetCaptionFromPCharLen(P: PUtf8Char; out result: string);
var
  buf: array[byte] of AnsiChar;
begin
  if P = nil then
    exit;
  {$ifdef UNICODE}
  Utf8DecodeToUnicodeString(buf, UnCamelCase(@buf, P), result);
  {$else}
  SetString(result, PAnsiChar(@buf), UnCamelCase(@buf, P));
  {$endif UNICODE}
  if Assigned(LoadResStringTranslate) then
    LoadResStringTranslate(result);
end;
|
|
|
|
|
|
{ ************ TRawUtf8DynArray Processing Functions }
|
|
|
|
// returns true if Values[] is void, or if all its items are ''
function IsZero(const Values: TRawUtf8DynArray): boolean;
var
  i: PtrInt;
begin
  for i := 0 to length(Values) - 1 do
    if Values[i] <> '' then
    begin
      result := false; // at least one item is not void
      exit;
    end;
  result := true;
end;
|
|
|
|
// create a new TRawUtf8DynArray from the supplied open array of values
function TRawUtf8DynArrayFrom(const Values: array of RawUtf8): TRawUtf8DynArray;
var
  n: PtrInt;
begin
  Finalize(result); // start from a void array
  SetLength(result, length(Values));
  for n := high(Values) downto 0 do
    result[n] := Values[n];
end;
|
|
|
|
// return the index of Value within the ValuesCount items stored at Values
// - a void Value will match the first '' item
// - otherwise, only items of the very same length are compared, either
// with CompareMemFixed() or with StrIComp(), depending on CaseSensitive
// - returns -1 if Value was not found
function FindRawUtf8(Values: PRawUtf8; const Value: RawUtf8; ValuesCount: integer;
  CaseSensitive: boolean): integer;
var
  len: TStrLen;
begin
  dec(ValuesCount); // = highest index
  len := length(Value);
  if len = 0 then
  begin
    // search for the first void item
    for result := 0 to ValuesCount do
      if Values^ = '' then
        exit
      else
        inc(Values);
  end
  else if CaseSensitive then
  begin
    // binary comparison of the items of the same length
    for result := 0 to ValuesCount do
      if (PtrUInt(Values^) <> 0) and
         ({%H-}PStrLen(PtrUInt(Values^) - _STRLEN)^ = len) and
         CompareMemFixed(pointer(PtrInt(Values^)), pointer(Value), len) then
        exit
      else
        inc(Values);
  end
  else
  begin
    // case-insensitive comparison of the items of the same length
    for result := 0 to ValuesCount do
      if (PtrUInt(Values^) <> 0) and // StrIComp() won't change length
         ({%H-}PStrLen(PtrUInt(Values^) - _STRLEN)^ = len) and
         (StrIComp(pointer(Values^), pointer(Value)) = 0) then
        exit
      else
        inc(Values);
  end;
  result := -1;
end;
|
|
|
|
// dynamic array overload of the PRawUtf8 FindRawUtf8() function
function FindRawUtf8(const Values: TRawUtf8DynArray; const Value: RawUtf8;
  CaseSensitive: boolean): integer;
begin
  result := FindRawUtf8(
    pointer(Values), Value, length(Values), CaseSensitive);
end;
|
|
|
|
// open array overload of the PRawUtf8 FindRawUtf8() function
// - returns -1 if Values[] is void
function FindRawUtf8(const Values: array of RawUtf8; const Value: RawUtf8;
  CaseSensitive: boolean): integer;
var
  last: integer;
begin
  last := high(Values);
  if last < 0 then
    result := last // = -1 for a void array
  else
    result := FindRawUtf8(@Values[0], Value, last + 1, CaseSensitive);
end;
|
|
|
|
// append Value at the end of the Values[] array
// - if NoDuplicates is set, returns false with no change to Values[] when
// FindRawUtf8() did find Value in the array
// - returns true if Value was appended
function AddRawUtf8(var Values: TRawUtf8DynArray; const Value: RawUtf8;
  NoDuplicates, CaseSensitive: boolean): boolean;
var
  n: PtrInt;
begin
  result := false;
  if NoDuplicates and
     (FindRawUtf8(Values, Value, CaseSensitive) >= 0) then
    exit; // already there
  n := length(Values);
  SetLength(Values, n + 1);
  Values[n] := Value;
  result := true;
end;
|
|
|
|
// append Value to Values[], using ValuesCount as external items counter
// - the array capacity is increased via NextGrow() only when it is full
// - returns the index of the newly added item
function AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
  const Value: RawUtf8): PtrInt;
begin
  result := ValuesCount;
  if Length(Values) = result then
    SetLength(Values, NextGrow(result)); // no more room
  Values[result] := Value;
  inc(ValuesCount);
end;
|
|
|
|
// append all the Value[] items at the end of the Values[] array
procedure AddRawUtf8(var Values: TRawUtf8DynArray; const Value: TRawUtf8DynArray);
var
  added, previous, i: PtrInt;
begin
  added := length(Value);
  if added = 0 then
    exit; // nothing to append
  previous := length(Values);
  SetLength(Values, previous + added);
  for i := 0 to added - 1 do
    Values[previous + i] := Value[i];
end;
|
|
|
|
// append all the Value[] items to Values[], using ValuesCount as external
// items counter
// - the array capacity is increased via NextGrow() only when needed
procedure AddRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
  const Value: TRawUtf8DynArray);
var
  added, previous, i: PtrInt;
begin
  added := length(Value);
  previous := ValuesCount;
  inc(ValuesCount, added);
  if Length(Values) < ValuesCount then
    SetLength(Values, NextGrow(ValuesCount));
  for i := 0 to added - 1 do
    Values[previous + i] := Value[i];
end;
|
|
|
|
// returns true if A[] and B[] have the same length and the same content
function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray): boolean;
var
  count, i: PtrInt;
begin
  result := false;
  count := length(A);
  if length(B) <> count then
    exit;
  i := 0;
  while i < count do
  begin
    if A[i] <> B[i] then
      exit; // found a difference
    inc(i);
  end;
  result := true;
end;
|
|
|
|
// returns true if the Count first items of A[] and B[] are identical
// - the actual length of both arrays is not checked here
function RawUtf8DynArrayEquals(const A, B: TRawUtf8DynArray; Count: integer): boolean;
var
  i: PtrInt;
begin
  for i := 0 to Count - 1 do
    if A[i] <> B[i] then
    begin
      result := false; // found a difference
      exit;
    end;
  result := true;
end;
|
|
|
|
function AddString(var Values: TStringDynArray; const Value: string): PtrInt;
var
  count: PtrInt;
begin
  // grow Values[] by one item, store Value there and return its index
  count := length(Values);
  SetLength(Values, count + 1);
  Values[count] := Value;
  result := count;
end;
|
|
|
|
procedure StringDynArrayToRawUtf8DynArray(const Source: TStringDynArray;
  var Result: TRawUtf8DynArray);
var
  count, ndx: PtrInt;
begin
  // replace the content of Result[] by the UTF-8 conversion of each Source[]
  count := length(Source);
  Finalize(Result); // release any previous content before resizing
  SetLength(Result, count);
  ndx := 0;
  while ndx < count do
  begin
    StringToUtf8(Source[ndx], Result[ndx]);
    inc(ndx);
  end;
end;
|
|
|
|
procedure StringListToRawUtf8DynArray(Source: TStringList; var Result: TRawUtf8DynArray);
var
  i, n: PtrInt;
begin
  // replace the content of Result[] by the UTF-8 conversion of each line
  // of the Source list
  Finalize(Result); // release any previous content
  if Source = nil then
    exit; // no list supplied -> return an empty array instead of a GPF
  n := Source.Count; // read the property once
  SetLength(Result, n);
  for i := 0 to n - 1 do
    StringToUtf8(Source[i], Result[i]);
end;
|
|
|
|
function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt;
var
  comparer: TUtf8Compare;
begin
  // same as the overload with an explicit comparer, using StrComp()
  comparer := TUtf8Compare(@StrComp);
  result := FastLocatePUtf8CharSorted(P, R, Value, comparer);
end;
|
|
|
|
function FastLocatePUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
  Value: PUtf8Char; Compare: TUtf8Compare): PtrInt;
var
  left, mid, order: PtrInt;
begin
  // locate where Value should be inserted in the sorted P^[0..R] list
  // - returns -1 if Value is already stored, or the insertion index otherwise
  // - returns 0 if there is no comparer or the list is void (R < 0)
  if (R < 0) or
     not Assigned(Compare) then
  begin
    result := 0;
    exit;
  end;
  if Compare(P^[R], Value) < 0 then
  begin
    // Value is greater than the last item: append (typical of sorted input)
    result := R + 1;
    exit;
  end;
  // O(log(n)) binary search
  result := -1; // value returned by the exit below when Value is found
  left := 0;
  repeat
    {$ifdef CPUX64}
    mid := left + R;
    mid := mid shr 1;
    {$else}
    mid := (left + R) shr 1;
    {$endif CPUX64}
    order := Compare(P^[mid], Value);
    if order = 0 then
      exit;
    if order < 0 then
      left := mid + 1
    else
      R := mid - 1;
  until left > R;
  // step back over every item which is not lower than Value
  while (mid >= 0) and
        (Compare(P^[mid], Value) >= 0) do
    dec(mid);
  result := mid + 1; // index where Value is to be inserted
end;
|
|
|
|
function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
  Value: PUtf8Char; Compare: TUtf8Compare): PtrInt;
var
  left, order: PtrInt;
begin
  // O(log(n)) binary search of Value within the sorted P^[0..R] list
  // - returns the index of the matching item, or -1 if none was found,
  // if the list is void (R < 0) or if no comparer was supplied
  left := 0;
  if Assigned(Compare) then
    while left <= R do
    begin
      {$ifdef CPUX64}
      result := left + R;
      result := result shr 1;
      {$else}
      result := (left + R) shr 1;
      {$endif CPUX64}
      order := Compare(P^[result], Value);
      if order = 0 then
        exit; // result is the index of the match
      if order < 0 then
        left := result + 1
      else
        R := result - 1;
    end;
  result := -1;
end;
|
|
|
|
{$ifdef CPUX64}
|
|
|
|
function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt;
{$ifdef FPC} assembler; nostackframe; asm {$else} asm .noframe {$endif}
        // binary search of Value in the sorted P^[0..R] list of PUtf8Char
        // - rax = current middle index (and result), r9 = left bound,
        // R = right bound, r12 = middle - 1, r13 = middle + 1
        {$ifdef win64}  // P=rcx/rdi R=rdx/rsi Value=r8/rdx
        push    rdi
        mov     rdi, P                      // P=rdi as on POSIX (cl is used below)
        {$endif win64}
        push    r12
        push    r13
        xor     r9, r9                      // left = 0
        test    R, R
        jl      @none                       // R < 0: void list
        test    Value, Value
        jz      @empty                      // Value = nil has its own path
        mov     cl, byte ptr [Value]        // cl = first char of Value
@probe: lea     rax, qword ptr [r9 + R]
        shr     rax, 1                      // middle = (left + R) shr 1
        lea     r12, qword ptr [rax - 1]    // both candidate bounds for cmov
        lea     r13, qword ptr [rax + 1]
        mov     r10, qword ptr [rdi + rax * 8]  // r10 = P^[middle]
        test    r10, r10
        jz      @isnil
        cmp     cl, byte ptr [r10]
        je      @same                       // first chars match: compare the rest
        cmovc   R, r12                      // Value char is below: R = middle - 1
        cmovnc  r9, r13                     // otherwise: left = middle + 1
@next:  cmp     r9, R
        jle     @probe                      // continue while left <= R
@none:  mov     rax, -1                     // not found
@done:  pop     r13
        pop     r12
        {$ifdef win64}
        pop     rdi
        {$endif win64}
        ret
@isnil: mov     r9, r13                     // P^[middle] = nil: left = middle + 1
        jmp     @next
@same:  mov     r11, Value                  // r11 walks Value, r10 walks the item
@chars: mov     cl, byte ptr [r10]
        add     r10, 1
        add     r11, 1
        test    cl, cl
        jz      @done                       // #0 reached on equal chars: rax = match
        mov     cl, byte ptr [r11]
        cmp     cl, byte ptr [r10]
        je      @chars
        mov     cl, byte ptr [Value]        // restore first char (mov keeps flags)
        cmovc   R, r12                      // Value char is below: R = middle - 1
        cmovnc  r9, r13                     // otherwise: left = middle + 1
        cmp     r9, R
        jle     @probe
        jmp     @none
@empty: mov     rax, -1                     // Value = nil ...
        cmp     qword ptr [P], 0
        cmove   rax, Value                  // ... returns 0 only if P^[0] = nil
        jmp     @done
end;
|
|
|
|
{$else}
|
|
|
|
function FastFindPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt; Value: PUtf8Char): PtrInt;
var
  left: PtrInt;
  ch: byte;
  item, wanted: PByte;
begin
  // O(log(n)) binary search of Value in the sorted P^[0..R] list, comparing
  // the #0-terminated buffers byte per byte
  // - returns the index of the matching item, or -1 if not found
  if R < 0 then
  begin
    result := -1; // void list
    exit;
  end;
  if Value = nil then
  begin
    // '' should be in lowest P[] slot
    if P^[0] = nil then
      result := 0
    else
      result := -1;
    exit;
  end;
  left := 0;
  while left <= R do
  begin
    result := left + R;
    result := result shr 1;
    item := pointer(P^[result]);
    if item <> nil then
    begin
      wanted := pointer(Value);
      ch := item^;
      while ch = wanted^ do
      begin
        if ch = 0 then
          exit; // both buffers ended together: P^[result] = Value
        inc(item);
        inc(wanted);
        ch := item^;
      end;
      if ch > wanted^ then
      begin
        R := result - 1; // P^[result] > Value: search the lower half
        continue;
      end;
    end;
    left := result + 1; // P^[result] < Value (or nil): search the upper half
  end;
  result := -1;
end;
|
|
|
|
{$endif CPUX64}
|
|
|
|
function FastFindUpperPUtf8CharSorted(P: PPUtf8CharArray; R: PtrInt;
  Value: PUtf8Char; ValueLen: PtrInt): PtrInt;
var
  upper: array[byte] of AnsiChar;
begin
  // copy Value into a local buffer through UpperCopy255Buf(), #0-terminate
  // it at the returned position, then search this buffer in P^[0..R]
  UpperCopy255Buf(@upper, Value, ValueLen)^ := #0;
  result := FastFindPUtf8CharSorted(P, R, @upper);
end;
|
|
|
|
function FastFindIndexedPUtf8Char(P: PPUtf8CharArray; R: PtrInt;
  var SortedIndexes: TCardinalDynArray; Value: PUtf8Char;
  ItemComp: TUtf8Compare): PtrInt;
var
  L, cmp: PtrInt;
begin
  // O(log(n)) binary search of Value through an indirection table:
  // SortedIndexes[0..R] gives the P^[] indexes in ItemComp() order
  // - returns the P^[] index of the matching item (not its position in
  // SortedIndexes[]), or -1 if not found or if the list is void (R < 0)
  // - returns -1 if ItemComp is not assigned, as the sibling
  // FastFindPUtf8CharSorted() does, instead of calling a nil pointer
  L := 0;
  if Assigned(ItemComp) then
    while L <= R do
    begin
      {$ifdef CPUX64}
      result := L + R;
      result := result shr 1;
      {$else}
      result := (L + R) shr 1;
      {$endif CPUX64}
      cmp := ItemComp(P^[SortedIndexes[result]], Value);
      if cmp = 0 then
      begin
        result := SortedIndexes[result]; // translate into the P^[] index
        exit;
      end;
      if cmp < 0 then
        L := result + 1
      else
        R := result - 1;
    end;
  result := -1;
end;
|
|
|
|
function AddSortedRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
  const Value: RawUtf8; CoValues: PIntegerDynArray; ForcedIndex: PtrInt;
  Compare: TUtf8Compare): PtrInt;
var
  capacity, moved: PtrInt;
begin
  // insert Value into the sorted Values[0..ValuesCount - 1] list
  // - ForcedIndex >= 0 bypasses the search and inserts at this position
  // - otherwise the position is searched with Compare (StrComp by default),
  // and a negative result is returned if Value is already stored
  // - the optional CoValues^[] integers are moved along with Values[]
  result := ForcedIndex;
  if result < 0 then
  begin
    if not Assigned(Compare) then
      Compare := @StrComp;
    result := FastLocatePUtf8CharSorted(pointer(Values), ValuesCount - 1,
      pointer(Value), Compare);
    if result < 0 then
      exit; // Value exists -> fails
  end;
  // grow both arrays when the storage is full
  capacity := Length(Values);
  if capacity = ValuesCount then
  begin
    capacity := NextGrow(capacity);
    SetLength(Values, capacity);
    if CoValues <> nil then
      SetLength(CoValues^, capacity);
  end;
  // make room for the new item
  moved := ValuesCount - result; // number of items located after the new one
  if moved > 0 then
  begin
    MoveFast(Pointer(Values[result]), Pointer(Values[result + 1]),
      moved * SizeOf(pointer));
    // the slot was moved as raw pointer: reset it without releasing the
    // string, which is now owned by Values[result + 1]
    PtrInt(Values[result]) := 0; // avoid GPF
    if CoValues <> nil then
      MoveFast(CoValues^[result], CoValues^[result + 1],
        moved * SizeOf(integer));
  end
  else
    result := ValuesCount; // at or after the end -> append
  Values[result] := Value;
  inc(ValuesCount);
end;
|
|
|
|
type
  /// internal state shared by all recursive calls of the RawUtf8 quick sort
  // - Compare is the ordering function applied to the item pointers
  // - CoValues is an optional array of integers swapped along with the items
  // - pivot stores the current pivot, so that it does not use stack space
  // at each recursion level
  {$ifdef USERECORDWITHMETHODS}
  TQuickSortRawUtf8 = record
  {$else}
  TQuickSortRawUtf8 = object
  {$endif USERECORDWITHMETHODS}
  public
    Compare: TUtf8Compare;
    CoValues: PIntegerArray;
    pivot: pointer;
    /// sort Values^[L..R] in place, according to Compare
    procedure Sort(Values: PPointerArray; L, R: PtrInt);
  end;
|
|
|
|
procedure TQuickSortRawUtf8.Sort(Values: PPointerArray; L, R: PtrInt);
var
  iLeft, iRight, iPivot: PtrInt;
  swapPtr: Pointer;
  swapInt: integer;
begin
  // in-place quick sort of Values^[L..R] pointers using the Compare field
  if L >= R then
    exit;
  repeat
    iLeft := L;
    iRight := R;
    iPivot := (L + R) shr 1;
    repeat
      pivot := Values^[iPivot];
      while Compare(Values^[iLeft], pivot) < 0 do
        inc(iLeft);
      while Compare(Values^[iRight], pivot) > 0 do
        dec(iRight);
      if iLeft <= iRight then
      begin
        // exchange both items, and their associated integers if any
        swapPtr := Values^[iRight];
        Values^[iRight] := Values^[iLeft];
        Values^[iLeft] := swapPtr;
        if CoValues <> nil then
        begin
          swapInt := CoValues^[iRight];
          CoValues^[iRight] := CoValues^[iLeft];
          CoValues^[iLeft] := swapInt;
        end;
        // follow the pivot if it has just been exchanged
        if iPivot = iLeft then
          iPivot := iRight
        else if iPivot = iRight then
          iPivot := iLeft;
        inc(iLeft);
        dec(iRight);
      end;
    until iLeft > iRight;
    // use recursion only for smaller range, and loop on the other one
    if iRight - L < R - iLeft then
    begin
      if L < iRight then
        Sort(Values, L, iRight);
      L := iLeft;
    end
    else
    begin
      if iLeft < R then
        Sort(Values, iLeft, R);
      R := iRight;
    end;
  until L >= R;
end;
|
|
|
|
procedure QuickSortRawUtf8(var Values: TRawUtf8DynArray; ValuesCount: integer;
  CoValues: PIntegerDynArray; Compare: TUtf8Compare);
var
  sorter: TQuickSortRawUtf8;
begin
  // sort the first ValuesCount items of Values[], together with the
  // optional CoValues^[] integers, using Compare (StrComp if not assigned)
  if not Assigned(Compare) then
    Compare := @StrComp;
  sorter.Compare := Compare;
  sorter.CoValues := nil;
  if CoValues <> nil then
    sorter.CoValues := pointer(CoValues^);
  sorter.Sort(pointer(Values), 0, ValuesCount - 1);
end;
|
|
|
|
procedure QuickSortRawUtf8(Values: PRawUtf8Array; L, R: PtrInt;
  caseInsensitive: boolean);
var
  sorter: TQuickSortRawUtf8;
begin
  // sort Values^[L..R] with the comparer stored in
  // StrCompByCase[caseInsensitive], without any associated integers
  sorter.CoValues := nil;
  sorter.Compare := StrCompByCase[caseInsensitive];
  sorter.Sort(pointer(Values), L, R);
end;
|
|
|
|
procedure MakeUniqueArray(var Values: TRawUtf8DynArray);
begin
  // replace Values by a copy of all its items
  // - kept as a sub-procedure to avoid try..finally in the callers
  Values := copy(Values, 0, length(Values));
end;
|
|
|
|
function DeleteRawUtf8(var Values: TRawUtf8DynArray; Index: PtrInt): boolean;
var
  count, tail: PtrInt;
begin
  // remove Values[Index] and shrink the array by one item
  // - returns false if Index is out of range
  result := false;
  count := length(Values);
  if PtrUInt(Index) >= PtrUInt(count) then
    exit; // unsigned comparison also rejects negative indexes
  // work on our own copy if the array reference count is above 1
  if PDACnt(PAnsiChar(pointer(Values)) - _DACNT)^ > 1 then
    MakeUniqueArray(Values);
  Values[Index] := ''; // avoid GPF
  tail := count - 1 - Index; // number of items following the deleted one
  if tail > 0 then
  begin
    MoveFast(pointer(Values[Index + 1]), pointer(Values[Index]),
      tail * SizeOf(pointer));
    // the last slot was copied as raw pointer: reset it so that the
    // SetLength() below does not release the moved string
    PtrUInt(Values[count - 1]) := 0; // avoid GPF
  end;
  SetLength(Values, count - 1);
  result := true;
end;
|
|
|
|
function DeleteRawUtf8(var Values: TRawUtf8DynArray; var ValuesCount: integer;
  Index: integer; CoValues: PIntegerDynArray): boolean;
var
  tail: integer;
begin
  // remove Values[Index] from the first ValuesCount items, decreasing
  // ValuesCount but leaving the array length untouched
  // - the optional CoValues^[] integers are shifted the same way
  // - returns false if Index is out of range
  result := false;
  if cardinal(Index) >= cardinal(ValuesCount) then
    exit; // unsigned comparison also rejects negative indexes
  dec(ValuesCount);
  // work on our own copy if the array reference count is above 1
  if PDACnt(PAnsiChar(pointer(Values)) - _DACNT)^ > 1 then
    MakeUniqueArray(Values);
  Values[Index] := ''; // avoid GPF
  tail := ValuesCount - Index; // number of items following the deleted one
  if tail > 0 then
  begin
    if CoValues <> nil then
      MoveFast(CoValues^[Index + 1], CoValues^[Index], tail * SizeOf(integer));
    MoveFast(pointer(Values[Index + 1]), pointer(Values[Index]),
      tail * SizeOf(pointer));
    // the former last slot was copied as raw pointer: just reset it
    PtrUInt(Values[ValuesCount]) := 0; // avoid GPF
  end;
  result := true;
end;
|
|
|
|
{$ifdef OSPOSIX}
|
|
|
|
{ TPosixFileCaseInsensitive }
|
|
|
|
constructor TPosixFileCaseInsensitive.Create(
  const aFolder: TFileName; aSubFolders: boolean);
begin
  // store the search parameters; the files list itself is not read here
  fFlushSeconds := 60; // default delay stored for the cache refresh
  fSubFolders := aSubFolders;
  fFolder := aFolder;
end;
|
|
|
|
procedure TPosixFileCaseInsensitive.SetFolder(const aFolder: TFileName);
begin
  // change the folder and drop the cached files list, under the write lock
  // - does nothing on a nil instance
  if self <> nil then
  begin
    fSafe.WriteLock;
    try
      fFiles := nil; // force list refresh
      fFolder := aFolder;
    finally
      fSafe.WriteUnLock;
    end;
  end;
end;
|
|
|
|
procedure TPosixFileCaseInsensitive.SetSubFolders(aSubFolders: boolean);
begin
  // change the sub-folders flag, flushing the cached list only if the
  // flag value actually changed
  // - does nothing on a nil instance
  if self = nil then
    exit;
  if aSubFolders = fSubFolders then
    exit;
  fSubFolders := aSubFolders;
  Flush;
end;
|
|
|
|
procedure TPosixFileCaseInsensitive.OnIdle(tix64: Int64);
begin
  // drop the cached files list once its expiration tick has been reached
  // - does nothing on a nil instance, if there is no cached list, or if
  // fFlushSeconds is 0
  // - tix64 = 0 makes this method call GetTickCount64 by itself
  if self = nil then
    exit;
  if (fFiles = nil) or
     (fFlushSeconds = 0) then
    exit;
  if tix64 = 0 then
    tix64 := GetTickCount64;
  // fNextTix uses the same "tick shr 10" unit as set by Find()
  if (tix64 shr 10) >= fNextTix then
  begin
    fSafe.WriteLock;
    try
      fFiles := nil; // force list refresh
    finally
      fSafe.WriteUnLock;
    end;
  end;
end;
|
|
|
|
procedure TPosixFileCaseInsensitive.Flush;
begin
  // drop the cached files list, so that it is read again when next needed
  // - does nothing on a nil instance, as the other methods of this class:
  // reading fFolder below would otherwise dereference the nil self pointer
  // before SetFolder() has a chance to perform its own check
  if self <> nil then
    SetFolder(fFolder); // same folder: only resets fFiles under the lock
end;
|
|
|
|
function TPosixFileCaseInsensitive.Find(const aSearched: TFileName;
  aReadMs: PInteger): TFileName;
var
  timeStart, timeStop: Int64;
  found: PtrInt;
  searched: RawUtf8;
  timed: boolean;
begin
  // case-insensitive search of aSearched within the cached files list
  // - returns the file name as stored in the list, or '' if not found, on
  // a nil instance, or if the folder or the searched name is void
  // - aReadMs^ is optional: reset to 0, then set to the difference of two
  // QueryPerformanceMicroSeconds() calls when the list had to be read
  result := '';
  timed := aReadMs <> nil;
  if timed then
    aReadMs^ := 0;
  if self = nil then
    exit;
  if (fFolder = '') or
     (aSearched = '') then
    exit;
  if fFiles = nil then // need to refresh the cache
  begin
    fSafe.WriteLock;
    try
      if fFiles = nil then // check again once the write lock is acquired
      begin
        if timed then
          QueryPerformanceMicroSeconds(timeStart);
        fFiles := PosixFileNames(fFolder, fSubFolders); // fast syscall
        QuickSortRawUtf8(fFiles, length(fFiles), nil, @StrIComp);
        // e.g. 4392 filenames from /home/ab/dev/lib/ in 7.20ms
        if timed then
        begin
          QueryPerformanceMicroSeconds(timeStop);
          aReadMs^ := timeStop - timeStart;
        end;
        // compute when OnIdle() should drop this list
        if fFlushSeconds <> 0 then
          fNextTix := (GetTickCount64 shr 10) + fFlushSeconds;
      end;
    finally
      fSafe.WriteUnLock;
    end;
  end;
  StringToUtf8(aSearched, searched);
  fSafe.ReadLock; // non-blocking lookup
  try
    found := FastFindPUtf8CharSorted( // efficient O(log(n)) binary search
      pointer(fFiles), high(fFiles), pointer(searched), @StrIComp);
    if found >= 0 then
      Utf8ToFileName(fFiles[found], result); // use exact file name case from OS
  finally
    fSafe.ReadUnLock;
  end;
end;
|
|
|
|
function TPosixFileCaseInsensitive.Count: PtrInt;
begin
  // number of file names currently stored in the cached list
  // - returns 0 on a nil instance
  result := 0;
  if self <> nil then
    result := length(fFiles);
end;
|
|
|
|
function TPosixFileCaseInsensitive.Files: TRawUtf8DynArray;
begin
  // return a private copy of the cached files list, made under the read lock
  // - returns nil on a nil instance or if there is no cached list
  result := nil;
  if self = nil then
    exit;
  if fFiles = nil then
    exit;
  fSafe.ReadLock;
  try
    result := copy(fFiles); // make a copy for thread safety
  finally
    fSafe.ReadUnLock;
  end;
end;
|
|
|
|
{$endif OSPOSIX}
|
|
|
|
|
|
{ ************** Operating-System Independent Unicode Process }
|
|
|
|
// freely inspired by Bero's PUCU library, released under zlib license
|
|
// https://github.com/BeRo1985/pucu (C)2016-2020 Benjamin Rosseaux
|
|
|
|
{$define UU_COMPRESSED}
|
|
// 1KB compressed static table in the exe renders into our 20KB UU[] array :)
|
|
|
|
type
  /// lookup tables for full Unicode 10.0 case folding branchless conversion
  // - 20,016 bytes in total: 38 * 128 * 4 for Block[], 272 for IndexHi[]
  // and 9 * 32 for IndexLo[]
  // - as used by Ucs4Upper: IndexHi[] selects an IndexLo[] row, IndexLo[]
  // selects a Block[] row, and Block[] stores the signed value to add to the
  // codepoint to get its uppercase
  {$ifdef USERECORDWITHMETHODS}
  TUnicodeUpperTable = record
  {$else}
  TUnicodeUpperTable = object
  {$endif USERECORDWITHMETHODS}
  public
    Block: array[0..37, 0..127] of integer;
    IndexHi: array[0..271] of byte;
    IndexLo: array[0..8, 0..31] of byte;
    /// branchless Unicode 10.0 uppercase folding using our internal tables
    // - caller should have checked that c <= UU_MAX
    function Ucs4Upper(c: PtrUInt): PtrUInt;
      {$ifdef HASINLINE} inline; {$endif}
  end;
  {$ifndef CPUX86NOTPIC}
  PUnicodeUpperTable = ^TUnicodeUpperTable;
  {$endif CPUX86NOTPIC}
|
|
|
|
const
  // a codepoint is split by Ucs4Upper into three indexes:
  // - its low UU_BLOCK_HI bits select the item within a Block[] row
  // - the next UU_INDEX_HI bits select the item within an IndexLo[] row
  // - the remaining high bits select the IndexHi[] item
  UU_BLOCK_HI = 7;
  UU_BLOCK_LO = (1 shl UU_BLOCK_HI) - 1; // = 127
  UU_INDEX_HI = 5;
  UU_INDEX_LO = (1 shl UU_INDEX_HI) - 1; // = 31
  // highest codepoint which can be supplied to Ucs4Upper
  UU_MAX = $10ffff;
|
|
|
|
var
|
|
{$ifdef UU_COMPRESSED}
|
|
UU: TUnicodeUpperTable;
|
|
{$else}
|
|
UU: TUnicodeUpperTable = (
|
|
Block: (
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 743, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -
|
|
32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -232, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, -300),
|
|
(195, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 0,
|
|
0, 97, 0, 0, 0, -1, 163, 0, 0, 0, 130, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0,
|
|
0, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 56, 0,
|
|
0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, -1, 0, -1, 0, -1, 0, -
|
|
1, 0, -1, 0, -1, 0, -1, -79, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 10815,
|
|
10815, 0, -1, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 10783, 10780,
|
|
10782, -210, -206, 0, -205, -205, 0, -202, 0, -203, 42319, 0, 0, 0, -205,
|
|
42315, 0, -207, 0, 42280, 42308, 0, -209, -211, 42308, 10743, 42305, 0, 0,
|
|
-211, 0, 10749, -213, 0, 0, -214, 0, 0, 0, 0, 0, 0, 0, 10727, 0, 0),
|
|
(-218, 0, 0, -218, 0, 0, 0, 42282, -218, -69, -217, -217, -71, 0, 0, 0, 0, 0,
|
|
-219, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42261, 42258, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, 130, 130, 130, 0, 0),
|
|
(0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, 0, -32,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, -63, -63, 0,
|
|
-62, -57, 0, 0, 0, -47, -54, -8, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -86, -80, 7, -116, 0, -96, 0, 0,
|
|
-1, 0, 0, -1, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, -80, -80, -80, -80, -80, -80, -80, -80, -80,
|
|
-80, -80, -80, -80, -80, -80, -80, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
|
|
(0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -15, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48,
|
|
-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48,
|
|
-48, -48, -48, -48, -48),
|
|
(-48, -48, -48, -48, -48, -48, -48, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8,
|
|
-8, -8, -8, 0, 0),
|
|
(-6254, -6253, -6244, -6242, -6242, -6243, -6236, -6181, 35266, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 35332, 0, 0, 0, 3814, 0, 0),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, 0, 0, 0, 0, -59, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1),
|
|
(8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8,
|
|
8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 74, 74, 86, 86, 86, 86, 100, 100,
|
|
128, 128, 112, 112, 126, 126, 0, 0),
|
|
(8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8,
|
|
8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
-7205, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -28, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
|
|
-16, -16, -16, -16, -16),
|
|
(0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -26, -26, -26, -26,
|
|
-26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26, -26,
|
|
-26, -26, -26, -26, -26, -26, -26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48,
|
|
-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48,
|
|
-48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48, -48,
|
|
-48, -48, -48, -48, -48, -48, -48, 0, 0, -1, 0, 0, 0, -10795, -10792, 0, -1,
|
|
0, -1, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0,
|
|
0, -1, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(-7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264,
|
|
-7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264,
|
|
-7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264, -7264,
|
|
-7264, -7264, -7264, -7264, -7264, 0, -7264, 0, 0, 0, 0, 0, -7264, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, -1),
|
|
(0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, 0, -1, 0, 0,
|
|
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -928, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864),
|
|
(-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864, -38864,
|
|
-38864, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, -40, -40, -40,
|
|
-40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40,
|
|
-40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40,
|
|
-40, -40, -40, -40, -40, -40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -40, -40,
|
|
-40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40,
|
|
-40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40, -40,
|
|
-40, -40, -40, -40, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
-64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64,
|
|
-64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64,
|
|
-64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64, -64,
|
|
-64, -64, -64, -64, -64, -64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
|
|
-32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
|
|
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34,
|
|
-34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34, -34,
|
|
-34, -34, -34, -34, -34, -34, -34, -34, -34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
|
|
);
|
|
IndexHi: (0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, 6, 7, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
|
|
IndexLo: ((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 12, 12, 12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12,
|
|
12, 12, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
|
12, 14, 15, 12, 16, 17, 18, 19), (12, 12, 20, 21, 12, 12, 12, 12, 12, 22,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 23, 24, 25, 12, 12,
|
|
12, 12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 26, 27, 28, 29, 12, 12, 12, 12,
|
|
12, 12, 30, 31, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12, 12, 12, 12, 12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
|
12, 12, 12, 12, 12, 32, 12), (12, 12, 12, 12, 12, 12, 12, 12, 33, 34, 12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 35, 12, 12, 12, 12,
|
|
12, 12), (12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
|
12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12), (12, 12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 37, 12, 12,
|
|
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12));
|
|
);
|
|
{$endif UU_COMPRESSED}
|
|
|
|
function TUnicodeUpperTable.Ucs4Upper(c: PtrUInt): PtrUInt;
var
  page: PtrUInt;
begin
  // uppercase one 0..UU_MAX = $10ffff Unicode codepoint with no branch:
  // IndexHi[] selects a row of IndexLo[], which selects the Block[] row whose
  // entry for the low bits of c is added to the codepoint itself
  // - no range check is done here: callers in this unit test c <= UU_MAX first
  page := c shr UU_BLOCK_HI;
  result := PtrInt(c) + Block[
    IndexLo[IndexHi[page shr UU_INDEX_HI], page and UU_INDEX_LO],
    c and UU_BLOCK_LO];
end;
|
|
|
|
// uppercase a #0-terminated UTF-8 text using the UU[] Unicode table
// - S is the #0-terminated source (nil is handled as an empty text)
// - D is the output buffer: the caller must supply enough room, since a
// codepoint may need more UTF-8 bytes once uppercased, and any invalid input
// sequence is written as U+FFFD, i.e. three UTF-8 bytes
// - returns the position of the #0 appended after the converted text
// - if D is nil, nothing is written and nil is returned
function Utf8UpperReference(S, D: PUtf8Char): PUtf8Char;
var
  c: PtrUInt;
  S2: PUtf8Char;
  {$ifdef CPUX86NOTPIC}
  tab: TUnicodeUpperTable absolute UU;
  {$else}
  tab: PUnicodeUpperTable;
  {$endif CPUX86NOTPIC}
begin
  result := D;
  if D = nil then
    exit; // no output buffer: same guard as the SLen overload
  {$ifndef CPUX86NOTPIC}
  tab := @UU;
  {$endif CPUX86NOTPIC}
  if S <> nil then
    repeat
      c := ord(S^);
      if c <= 127 then
      begin
        if c = 0 then
          break; // reached the #0 terminator
        inc(c, tab.Block[0, c]); // branchless a..z -> A..Z
        D^ := AnsiChar(c);
        inc(S);
        inc(D);
        continue;
      end;
      if c and $20 = 0 then
      begin
        if byte(S[1]) and $c0 = $80 then
        begin
          c := (c shl 6) + byte(S[1]) - $3080; // fast process $0..$7ff
          inc(S, 2);
        end
        else
        begin
          // truncated or malformed 2-byte sequence: only consume the lead
          // byte, so that a following #0 terminator is never skipped
          c := UNICODE_REPLACEMENT_CHARACTER;
          inc(S);
        end;
      end
      else
      begin
        S2 := S;
        c := UTF8_TABLE.GetHighUtf8Ucs4(S2); // handle even surrogates
        S := S2;
        if c = 0 then
          c := UNICODE_REPLACEMENT_CHARACTER; // =$fffd for invalid input
      end;
      if c <= UU_MAX then
        c := tab.Ucs4Upper(c);
      inc(D, Ucs4ToUtf8(c, D));
    until false;
  D^ := #0;
  result := D;
end;
|
|
|
|
// uppercase SLen bytes of UTF-8 text from S into D using the UU[] table
// - returns the position of the #0 appended after the converted text
// - conversion stops at the first invalid or truncated UTF-8 sequence
// - if S or D is nil, nothing is written and D is returned unchanged
function Utf8UpperReference(S, D: PUtf8Char; SLen: PtrUInt): PUtf8Char;
var
  cp: PtrUInt;
  lastQuad: PUtf8Char;
  tail, n: PtrInt;
  {$ifdef CPUX86NOTPIC}
  tab: TUnicodeUpperTable absolute UU;
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  tab: PUnicodeUpperTable;
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  byOne, byFour; // jumping between both loops is ugly but faster
begin
  if (S <> nil) and
     (D <> nil) then
  begin
    {$ifndef CPUX86NOTPIC}
    tab := @UU;
    utf8 := @UTF8_TABLE;
    {$endif CPUX86NOTPIC}
    // from now on, SLen holds the address just after the last input byte
    inc(SLen, PtrUInt(S));
    lastQuad := PUtf8Char(SLen) - 4; // last position where 4 bytes are readable
    // first handle leading 7-bit ASCII chars, four bytes per iteration
    if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(S) and 3 = 0) and {$endif}
       (S <= lastQuad) then
      repeat
        if PCardinal(S)^ and $80808080 <> 0 then
          goto byOne; // this quad contains at least one non ASCII byte
byFour:
        n := byte(S[0]);
        inc(n, tab.Block[0, n]); // branchless a..z -> A..Z
        D[0] := AnsiChar(n);
        n := byte(S[1]);
        inc(n, tab.Block[0, n]);
        D[1] := AnsiChar(n);
        n := byte(S[2]);
        inc(n, tab.Block[0, n]);
        D[2] := AnsiChar(n);
        n := byte(S[3]);
        inc(n, tab.Block[0, n]);
        D[3] := AnsiChar(n);
        inc(S, 4);
        inc(D, 4);
      until S > lastQuad;
    // generic loop, converting a single UCS4 codepoint per iteration
    if S < PUtf8Char(SLen) then
      repeat
byOne:
        cp := byte(S^);
        inc(S);
        if cp <= 127 then
        begin
          inc(cp, tab.Block[0, cp]); // branchless a..z -> A..Z
          D^ := AnsiChar(cp);
          inc(D);
          // switch back to the quad loop as soon as 4 ASCII bytes follow
          if {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}(PtrUInt(S) and 3 = 0) and{$endif}
             (S <= lastQuad) then
            if PCardinal(S)^ and $80808080 = 0 then
              goto byFour
            else
              continue
          else if S < PUtf8Char(SLen) then
            continue
          else
            break;
        end
        else
        begin
          tail := utf8.Lookup[cp]; // number of bytes following the lead byte
          if (tail = UTF8_INVALID) or
             (S + tail > PUtf8Char(SLen)) then
            break; // invalid lead byte, or sequence truncated by SLen
          n := 0;
          repeat
            cp := (cp shl 6) + byte(S[n]);
            inc(n)
          until n = tail;
          inc(S, tail);
          dec(cp, utf8.Extra[tail].offset);
          if cp < utf8.Extra[tail].minimum then
            break; // invalid input content
          if cp <= UU_MAX then
            cp := tab.Ucs4Upper(cp);
          inc(D, Ucs4ToUtf8(cp, D));
          if S < PUtf8Char(SLen) then
            continue
          else
            break;
        end;
      until false;
    D^ := #0;
  end;
  result := D;
end;
|
|
|
|
// return the uppercase version of S, converted via Utf8UpperReference()
function UpperCaseReference(const S: RawUtf8): RawUtf8;
var
  srclen: integer;
  work: TSynTempBuffer;
  last: PUtf8Char;
begin
  srclen := length(S);
  // reserve twice the input size: a codepoint may need more UTF-8 bytes
  // once it has been uppercased
  work.Init(srclen * 2);
  last := Utf8UpperReference(pointer(S), work.buf, srclen);
  work.Done(last, result);
end;
|
|
|
|
// decode S into an array of uppercased UCS4 codepoints, ending with a 0 item
// - returns nil if S is '' or if no codepoint could be decoded
function UpperCaseUcs4Reference(const S: RawUtf8): RawUcs4;
var
  cp, count: PtrUInt;
  P: PUtf8Char;
begin
  if S <> '' then
  begin
    // one item per input byte is the worst case, plus the trailing 0
    SetLength(result, length(S) + 1);
    P := pointer(S);
    count := 0;
    cp := NextUtf8Ucs4(P);
    while cp <> 0 do // NextUtf8Ucs4() = 0 stops the decoding
    begin
      if cp <= UU_MAX then
        cp := UU.Ucs4Upper(cp);
      result[count] := cp;
      inc(count);
      cp := NextUtf8Ucs4(P);
    end;
    if count <> 0 then
    begin
      result[count] := 0; // always end with a 0
      DynArrayFakeLength(result, count); // faster than SetLength()
      exit;
    end;
  end;
  result := nil;
end;
|
|
|
|
// case-insensitive comparison of two #0-terminated UTF-8 texts via UU[]
// - returns 0 if both are equal, <0 if u1<u2, >0 if u1>u2
// - a nil pointer is handled as an empty text
// - a 2-byte sequence whose second byte is not a $80..$bf continuation is
// compared as U+FFFD and only its lead byte is consumed, so that the #0
// terminator is never skipped on truncated input
function Utf8ICompReference(u1, u2: PUtf8Char): PtrInt;
var
  c2: PtrInt;
  {$ifdef CPUX86NOTPIC}
  tab: TUnicodeUpperTable absolute UU;
  {$else}
  tab: PUnicodeUpperTable;
  {$endif CPUX86NOTPIC}
label
  c2low;
begin
  {$ifndef CPUX86NOTPIC}
  tab := @UU;
  {$endif CPUX86NOTPIC}
  if u1 <> u2 then
    if u1 <> nil then
      if u2 <> nil then
        repeat
          result := ord(u1^);
          c2 := ord(u2^);
          if result <= 127 then
          begin
            if result = 0 then
            begin
              // result=u1^=#0 -> end of u1 reached
              if c2 <> 0 then // end of u2 reached -> u1=u2 -> return 0
                result := -1; // u1<u2
              exit;
            end;
            inc(u1);
            inc(result, tab.Block[0, result]); // branchless a..z -> A..Z
            if c2 <= 127 then
            begin
c2low:
              if c2 = 0 then
                exit; // u1>u2 -> return u1^
              inc(u2);
              inc(c2, tab.Block[0, c2]);
              dec(result, c2);
              if result <> 0 then
                exit;
              continue;
            end;
          end
          else
          begin
            // fast Unicode 10.0 uppercase conversion
            if result and $20 = 0 then // $0..$7ff common case
            begin
              if byte(u1[1]) and $c0 = $80 then
              begin
                result := (result shl 6) + byte(u1[1]) - $3080;
                inc(u1, 2);
              end
              else
              begin
                // truncated or malformed: do not step over a #0 terminator
                result := UNICODE_REPLACEMENT_CHARACTER;
                inc(u1);
              end;
            end
            else
              result := UTF8_TABLE.GetHighUtf8Ucs4(u1);
            if PtrUInt(result) <= UU_MAX then
              result := tab.Ucs4Upper(result);
          end;
          // here result = uppercased codepoint of u1: now decode u2
          if c2 <= 127 then
            goto c2low
          else if c2 and $20 = 0 then // $0..$7ff common case
          begin
            if byte(u2[1]) and $c0 = $80 then
            begin
              c2 := (c2 shl 6) + byte(u2[1]) - $3080;
              inc(u2, 2);
            end
            else
            begin
              // truncated or malformed: do not step over a #0 terminator
              c2 := UNICODE_REPLACEMENT_CHARACTER;
              inc(u2);
            end;
          end
          else
            c2 := UTF8_TABLE.GetHighUtf8Ucs4(u2);
          if PtrUInt(c2) <= UU_MAX then
            c2 := tab.Ucs4Upper(c2);
          dec(result, c2);
          if result <> 0 then
            exit;
        until false
      else
        result := 1 // u2=''
    else
      result := -1 // u1=''
  else
    result := 0; // u1=u2
end;
|
|
|
|
// case-insensitive comparison of two UTF-8 buffers of L1 and L2 bytes via UU[]
// - returns 0 if both are equal, <0 if u1<u2, >0 if u1>u2
// - a nil pointer or a length <= 0 is handled as an empty text, and two empty
// texts are always equal, even when stored in two different buffers
// - when u1 and u2 are the very same buffer, the lengths give the order
// - an invalid or truncated UTF-8 sequence makes its own text the lower one
function Utf8ILCompReference(u1, u2: PUtf8Char; L1, L2: integer): PtrInt;
var
  c2: PtrInt;
  extra, i: integer;
  {$ifdef CPUX86NOTPIC}
  tab: TUnicodeUpperTable absolute UU;
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  tab: PUnicodeUpperTable;
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  neg, pos;
begin
  {$ifndef CPUX86NOTPIC}
  tab := @UU;
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  result := 0;
  // normalize the inputs: nil buffer or negative length means no text at all
  if (u1 = nil) or
     (L1 < 0) then
    L1 := 0;
  if (u2 = nil) or
     (L2 < 0) then
    L2 := 0;
  if L1 = 0 then
  begin
    if L2 <> 0 then
      goto neg; // u1='' and u2<>'' -> u1<u2
    exit; // u1='' and u2='' -> u1=u2 -> returns 0
  end;
  if L2 = 0 then
    goto pos; // u1<>'' and u2='' -> u1>u2
  if u1 = u2 then
  begin
    // same buffer: the shortest length is a prefix of the longest
    if L1 < L2 then
      goto neg;
    if L1 > L2 then
      goto pos;
    exit; // same buffer and same length -> returns 0
  end;
  repeat
    result := ord(u1^);
    c2 := ord(u2^);
    inc(u1);
    dec(L1);
    if result <= 127 then
    begin
      inc(result, tab.Block[0, result]); // branchless a..z -> A..Z
      if c2 <= 127 then
      begin
        // most common case: both chars are 7-bit ASCII
        inc(c2, tab.Block[0, c2]);
        dec(L2);
        inc(u2);
        dec(result, c2);
        if result <> 0 then
          exit; // found unmatching char
        if L1 = 0 then
        begin
          if L2 <> 0 then
            goto neg; // L1=0 and L2>0 -> u1<u2
          exit; // L1=0 and L2=0 -> u1=u2 -> returns 0
        end;
        if L2 = 0 then
          goto pos; // L1>0 and L2=0 -> u1>u2
        continue; // L1>0 and L2>0 -> next char
      end;
    end
    else
    begin
      // fast Unicode 10.0 uppercase conversion
      extra := utf8.Lookup[result];
      if extra = UTF8_INVALID then
        goto neg; // invalid leading byte
      dec(L1, extra);
      if L1 < 0 then
        goto neg; // sequence truncated by L1
      i := 0;
      repeat
        result := result shl 6;
        inc(result, ord(u1[i]));
        inc(i);
      until i = extra;
      inc(u1, extra);
      dec(result, utf8.Extra[extra].offset);
      if PtrUInt(result) <= UU_MAX then
        result := tab.Ucs4Upper(result);
    end;
    // here result = uppercased codepoint of u1: now decode u2
    inc(u2);
    dec(L2);
    if c2 <= 127 then
    begin
      inc(c2, tab.Block[0, c2]);
      dec(result, c2);
      if result <> 0 then
        exit; // found unmatching codepoint
    end
    else
    begin
      extra := utf8.Lookup[c2];
      if extra = UTF8_INVALID then
        goto pos; // invalid leading byte
      dec(L2, extra);
      if L2 < 0 then
        goto pos; // sequence truncated by L2
      i := 0;
      repeat
        c2 := c2 shl 6;
        inc(c2, ord(u2[i]));
        inc(i);
      until i = extra;
      inc(u2, extra);
      dec(c2, utf8.Extra[extra].offset);
      if PtrUInt(c2) <= UU_MAX then
        c2 := tab.Ucs4Upper(c2);
      dec(result, c2);
      if result <> 0 then
        exit; // found unmatching codepoint
    end;
    // here we have result=0: test if we reached end of u1 or end of u2
    if L1 = 0 then
    begin
      if L2 <> 0 then
        goto neg; // u1<u2
      exit; // u1=u2
    end;
    if L2 = 0 then
      goto pos; // u1>u2
  until false;
pos:
  result := 1; // u2='' or u1>u2
  exit;
neg:
  result := -1; // u1='' or u1<u2
end;
|
|
|
|
// case-insensitive search of the Up codepoints within a #0-terminated text
// - U is the UTF-8 text to search in
// - Up is compared against the uppercased codepoints of U, and is read up to
// its first 0 item
// - returns the position in U where the match starts, or nil if there is no
// match, if U or Up is nil, or if some invalid UTF-8 is encountered
// - any byte following a multi-byte lead byte must be a $80..$bf continuation,
// so that a truncated sequence can never step over the #0 terminator of U
function StrPosIReference(U: PUtf8Char; const Up: RawUcs4): PUtf8Char;
var
  c, extra, i: PtrInt;
  u0, u2: PUtf8Char;
  up2: PInteger;
  {$ifdef CPUX86NOTPIC}
  tab: TUnicodeUpperTable absolute UU;
  utf8: TUtf8Table absolute UTF8_TABLE;
  {$else}
  tab: PUnicodeUpperTable;
  utf8: PUtf8Table;
  {$endif CPUX86NOTPIC}
label
  nxt;
begin
  result := nil;
  if (U = nil) or
     (Up = nil) then
    exit;
  {$ifndef CPUX86NOTPIC}
  tab := @UU;
  utf8 := @UTF8_TABLE;
  {$endif CPUX86NOTPIC}
  repeat
    // fast search for the first character
nxt:
    u0 := U;
    c := byte(U^);
    inc(U);
    if c <= 127 then
    begin
      if c = 0 then
        exit; // not found -> return nil
      inc(c, tab.Block[0, c]); // branchless a..z -> A..Z
      if c <> Up[0] then
        continue;
    end
    else
    begin
      extra := utf8.Lookup[c];
      if extra = UTF8_INVALID then
        exit; // invalid input
      i := 0;
      repeat
        if byte(U[i]) and $c0 <> $80 then
          exit; // truncated or malformed sequence -> invalid input
        c := c shl 6;
        inc(c, ord(U[i]));
        inc(i);
      until i = extra;
      inc(U, extra);
      dec(c, utf8.Extra[extra].offset);
      if PtrUInt(c) <= UU_MAX then
        c := tab.Ucs4Upper(c);
      if c <> Up[0] then
        continue;
    end;
    // if we reached here, U^ and Up^ first UCS4 CodePoint do match
    u2 := U;
    up2 := @Up[1];
    repeat
      if up2^ = 0 then
      begin
        result := u0; // found -> return position in U
        exit;
      end;
      c := byte(u2^);
      inc(u2);
      if c <= 127 then
      begin
        if c = 0 then
          exit; // not found -> return nil
        inc(c, tab.Block[0, c]);
        if c <> up2^ then
          goto nxt; // restart the search just after u0 first codepoint
        inc(up2);
      end
      else
      begin
        extra := utf8.Lookup[c];
        if extra = UTF8_INVALID then
          exit; // invalid input
        i := 0;
        repeat
          if byte(u2[i]) and $c0 <> $80 then
            exit; // truncated or malformed sequence -> invalid input
          c := c shl 6;
          inc(c, ord(u2[i]));
          inc(i);
        until i = extra;
        inc(u2, extra);
        dec(c, utf8.Extra[extra].offset);
        if PtrUInt(c) <= UU_MAX then
          c := tab.Ucs4Upper(c);
        if c <> up2^ then
          goto nxt; // restart the search just after u0 first codepoint
        inc(up2);
      end;
    until false;
  until false;
end;
|
|
|
|
|
|
|
|
|
|
const
  // uppercase value of each 138..255 WinAnsi/CP1252 char, copied at startup
  // into NormToUpperByte[138..255] by InitializeUnit
  // - the trailing comment of each row is the char code of its first item
  WinAnsiToUp: array[138..255] of byte = (
    83, 139, 140, 141, 90, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, // 138
    153, 83, 155, 140, 157, 90, 89, 160, 161, 162, 163, 164, 165, 166, 167, // 153
    168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, // 168
    183, 184, 185, 186, 187, 188, 189, 190, 191, 65, 65, 65, 65, 65, 65, // 183
    198, 67, 69, 69, 69, 69, 73, 73, 73, 73, 68, 78, 79, 79, 79, // 198
    79, 79, 215, 79, 85, 85, 85, 85, 89, 222, 223, 65, 65, 65, 65, // 213
    65, 65, 198, 67, 69, 69, 69, 69, 73, 73, 73, 73, 68, 78, 79, // 228
    79, 79, 79, 79, 247, 79, 85, 85, 85, 85, 89, 222, 89); // 243
|
|
|
|
{$ifdef UU_COMPRESSED}
|
|
|
|
// 1KB compressed buffer which renders into our 20,016 bytes UU[] array
|
|
UU_: array[byte] of cardinal = (
|
|
$040019FD, $FF5A6024, $00855A00, $FFFFFFE0, $5A5201F0, $02E700E8, $FFE0AA5A,
|
|
$E0045A4B, $5A790BFF, $045A0007, $A045A1FF, $DB1878BA, $01A82B01, $0145A000,
|
|
$1DA45008, $041E5A80, $401DA450, $5A8F185A, $FFFFFED4, $590B5AC3, $0C5A84A4,
|
|
$5314A453, $610008A4, $A4520F5A, $82F5A1A3, $F1EBB5AB, $5A44DDF7, $52105A84,
|
|
$5A845AA4, $5A4A5AC4, $5A385AC6, $11A45217, $10ABA500, $45A00200, $4F5A4401,
|
|
$0000B15A, $A05A4F04, $5A830145, $C65A0018, $5EBAA05A, $20245AC0, $85A1A452,
|
|
$5BB55700, $00002A3F, $065A2A3F, $5B04A453, $1F055A40, $A11C02A1, $02A11E02,
|
|
$3200012E, $45A10001, $C2000133, $3645A10C, $45A10001, $4F000135, $00550690,
|
|
$000E5AA5, $A54B0CC2, $013165A1, $2845A100, $440000A5, $012FAC02, $00012D00,
|
|
$F70A5144, $41000029, $000A5AA5, $2AC4A16B, $45A10D22, $2B0291FD, $85A10001,
|
|
$1C00022A, $A129E700, $000226A5, $0D930008, $512A000C, $BB0D920A, $05270001,
|
|
$0001B900, $0644145A, $25000107, $00280002, $120A5115, $F1F552A5, $54009C55,
|
|
$A453AF5A, $5AC45A44, $0082000C, $5A208400, $FFDA00BB, $AD6EFFFF, $09DB15D5,
|
|
$F045A100, $01E13201, $1201F000, $C10001C0, $45A10005, $C70001C2, $C5A10001,
|
|
$CA0001D1, $01F80001, $A045A100, $01AA33BA, $0001B000, $D0000007, $001EFBFB,
|
|
$A2FFFF8C, $0002A0AB, $85A15A83, $E0D0BAA2, $01F00BFF, $2E01F012, $04F004F2,
|
|
$3645A02A, $240D45A0, $5A44A459, $847E5A40, $115A405A, $400002F1, $45A0115A,
|
|
$CC5A400D, $1FD000C4, $01255507, $8202F000, $55F1F555, $00C955F4, $700001F8,
|
|
$85A10200, $FFFFE792, $9C018193, $859E0181, $01819D01, $DB0181A4, $89C20181,
|
|
$00C5F558, $3C90F1E4, $E5A18A04, $E5A10EE6, $5A40BAA2, $CC5A40CC, $01C50014,
|
|
$A045B100, $45AAFFBA, $00000008, $5A070080, $04800023, $80002B5A, $80000604,
|
|
$00000635, $BC2F5A0E, $00F75B6D, $D875A108, $050A30A7, $014A0009, $5604A200,
|
|
$056A0001, $42000164, $00018006, $01700802, $7E070200, $A17E0001, $070080B5,
|
|
$80080001, $00022635, $35860002, $C4304825, $810975A1, $01E3DBB5, $090010A5,
|
|
$8004335A, $80053B5A, $5A07000F, $F1CA0137, $E4006C55, $84A501FF, $0001F000,
|
|
$8E2A00F0, $5AEDC05B, $55F15B04, $002F55F4, $900001E6, $F5525201, $87FFD019,
|
|
$5A1202F0, $000C5A84, $FFFFD5D5, $AA02A1D8, $1845A545, $5A84A453, $40A45328,
|
|
$5A40CC5A, $FB5FC736, $445804BF, $305B045A, $01C1A000, $A182C5E0, $B1C5E245,
|
|
$F4C5E345, $A4504E55, $A4504C77, $1E55F441, $C417A450, $405A445A, $588A9C5A,
|
|
$5A405A84, $045B0406, $C05A445B, $592C2A5A, $C6F021A4, $6E55F49B, $01FC6000,
|
|
$300070A5, $60FFFF68, $0970FF7C, $F1F55219, $FFE00655, $35F55257, $0001D800,
|
|
$528A0270, $2255F1F5, $527F7FD0, $F20011F5, $00063B03, $B603F000, $E035F552,
|
|
$01F057FF, $09F55206, $0001DE00, $5A720210, $020100F1, $0403075A, $0503045A,
|
|
$0C5A0706, $F15A0803, $00000012, $03120103, $08675104, $5A0B0A09, $5A0D0C1B,
|
|
$0F0E0C11, $1211100C, $140C0C13, $0C055A15, $00005A16, $0C0E0000, $5A191817,
|
|
$1B1A0C31, $065A1D1C, $5A1F1E0C, $5A200C26, $22210C09, $230C0F5A, $0000175A,
|
|
$240C0000, $250C205A, $000C0D5A, $00000000);
|
|
|
|
// fill the global UU[] table from its UU_[] compressed constant
// - raise ESynUnicode if the uncompressed size or its crc32c do not match
procedure InitializeUU;
var
  rle: array[0..7000] of byte; // only 6653 bytes are actually needed
  rlelen: integer;
begin
  // Uppercase Unicode table: SynLZ then RLE decompression from 1KB to 20KB
  rlelen := SynLZdecompress1(@UU_, 1019, @rle);
  if (RleUnCompress(@rle, @UU, rlelen) <> SizeOf(UU)) or
     (crc32c(0, @UU, SizeOf(UU)) <> $7343D053) then
    raise ESynUnicode.Create('UU Table Decompression Failed'); // paranoid
end;
|
|
|
|
{$endif UU_COMPRESSED}
|
|
|
|
(*
|
|
procedure doUU;
|
|
var
|
|
tmp1, tmp2: array[0..5500] of cardinal;
|
|
rle, lz, i: PtrInt;
|
|
l: RawUtf8;
|
|
begin
|
|
rle := RleCompress(@UU, @tmp1, SizeOf(UU), SizeOf(tmp1));
|
|
lz := SynLZCompress1(@tmp1, rle, @tmp2);
|
|
writeln(SizeOf(UU)); writeln(rle); writeln(lz);
|
|
writeln('UU_: array[byte] of cardinal = ('); l := ' ';
|
|
for i := 0 to 255 do
|
|
begin
|
|
l := l + '$' + HexStr(tmp2[i], 8) + ',';
|
|
if length(l) > 70 then
|
|
begin
|
|
writeln(l);
|
|
l := ' ';
|
|
end;
|
|
end;
|
|
writeln(l, ');');
|
|
end;
|
|
*)
|
|
|
|
// unit startup: fill the lookup tables, the comparison function redirections
// and the shared conversion engines - called once from initialization
procedure InitializeUnit;
var
  n: PtrInt;
  ch, up: AnsiChar;
begin
  // decompress 1KB static in the exe into 20KB UU[] array for Unicode Uppercase
  {$ifdef UU_COMPRESSED}
  InitializeUU;
  {$endif UU_COMPRESSED}
  // identity table, then its 7-bit ASCII upper and lower variants
  for n := 0 to 255 do
    NormToNormByte[n] := n;
  NormToUpperAnsi7Byte := NormToNormByte;
  NormToLowerAnsi7Byte := NormToNormByte;
  for n := 0 to ord('z') - ord('a') do
  begin
    dec(NormToUpperAnsi7Byte[ord('a') + n], 32);
    inc(NormToLowerAnsi7Byte[ord('A') + n], 32);
  end;
  // WinAnsi tables: 7-bit ASCII rules below 138, WinAnsiToUp[] values above
  MoveFast(NormToUpperAnsi7, NormToUpper, 138);
  MoveFast(WinAnsiToUp, NormToUpperByte[138], SizeOf(WinAnsiToUp));
  for ch := low(ch) to high(ch) do
  begin
    up := NormToUpper[ch];
    if up in ['A'..'Z'] then
      inc(up, 32);
    NormToLower[ch] := up;
  end;
  // per-char flags used by the text parsers
  for ch := low(ch) to high(ch) do
  begin
    if ch in [#10, #13] then
      include(TEXT_CHARS[ch], tc1013)
    else if ch <> #0 then
      include(TEXT_CHARS[ch], tcNot01013);
    if ch in ['_', '-', '.', '0'..'9', 'a'..'z', 'A'..'Z'] then
    begin
      // '~' is part of the RFC 3986 but should be escaped in practice
      // see https://blog.synopse.info/?post/2020/08/11/The-RFC%2C-The-URI%2C-and-The-Tilde
      include(TEXT_CHARS[ch], tcUriUnreserved);
      if not (ch in ['-', '.']) then
      begin
        // '_', digits and letters
        include(TEXT_CHARS[ch], tcIdentifier);
        if ch <> '_' then
          include(TEXT_CHARS[ch], tcWord); // digits and letters
        if not (ch in ['0'..'9']) then
          include(TEXT_CHARS[ch], tcIdentifierFirstChar); // '_' and letters
      end;
    end;
    if ch in [#1..' '] then
    begin
      include(TEXT_CHARS[ch], tcCtrlNot0Comma);
      if not (ch in [#10, #13]) then
        include(TEXT_CHARS[ch], tcCtrlNotLF);
    end
    else if ch = ';' then
      include(TEXT_CHARS[ch], tcCtrlNot0Comma);
  end;
  // setup sorting functions redirection
  StrCompByCase[false] := @StrComp;
  StrCompByCase[true] := @StrIComp;
  {$ifdef CPUINTEL}
  SortDynArrayAnsiStringByCase[false] := @SortDynArrayAnsiString;
  {$else}
  SortDynArrayAnsiStringByCase[false] := @SortDynArrayRawByteString;
  {$endif CPUINTEL}
  SortDynArrayAnsiStringByCase[true] := @SortDynArrayAnsiStringI;
  IdemPropNameUSameLen[false] := @IdemPropNameUSameLenNotNull;
  IdemPropNameUSameLen[true] := @mormot.core.base.CompareMem;
  // setup basic Unicode conversion engines
  // - the list is sized before any NewEngine() call
  SetLength(SynAnsiConvertListCodePage, 16); // no resize -> more thread safe
  CurrentAnsiConvert := NewEngine(Unicode_CodePage);
  WinAnsiConvert := NewEngine(CP_WINANSI) as TSynAnsiFixedWidth;
  Utf8AnsiConvert := NewEngine(CP_UTF8) as TSynAnsiUtf8;
  RawByteStringConvert := NewEngine(CP_RAWBYTESTRING) as TSynAnsiFixedWidth;
  // setup optimized ASM functions, with the pascal version as default
  IsValidUtf8Buffer := @IsValidUtf8Pas;
  {$ifdef ASMX64AVXNOCONST}
  if cpuHaswell in X64CpuFeatures then
    // Haswell CPUs can use simdjson AVX2 asm for IsValidUtf8()
    IsValidUtf8Buffer := @IsValidUtf8Avx2;
  {$endif ASMX64AVXNOCONST}
end;
|
|
|
|
|
|
initialization
  // fill all lookup tables and conversion engines once, at unit startup
  InitializeUnit;

end.
|