X-Git-Url: https://deadsoftware.ru/gitweb?a=blobdiff_plain;f=src%2Fshared%2Futils.pas;h=10f7f930faa92089fd260571492dc767caf9f40c;hb=6e52b3dc2eb459c745e571dfef5afb5c2ccd546f;hp=253e838961316e5bd5e2b19595ba10d7760e033a;hpb=c3be56f2a3849cd22be39ce594498c7990e05606;p=d2df-editor.git diff --git a/src/shared/utils.pas b/src/shared/utils.pas index 253e838..10f7f93 100644 --- a/src/shared/utils.pas +++ b/src/shared/utils.pas @@ -2,8 +2,7 @@ * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. + * the Free Software Foundation, version 3 of the License ONLY. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -13,7 +12,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . *) -{$MODE DELPHI} +{$INCLUDE a_modes.inc} unit utils; interface @@ -22,6 +21,36 @@ uses SysUtils, Classes; +// ////////////////////////////////////////////////////////////////////////// // +type + TUtf8DecoderFast = packed record + public + const Replacement = $FFFD; // replacement char for invalid unicode + const Accept = 0; + const Reject = 12; + + private + state: LongWord; + + public + codepoint: LongWord; // decoded codepoint (valid only when decoder is in "complete" state) + + public + constructor Create (v: Boolean{fuck you, fpc}); + + procedure reset (); inline; + + function complete (): Boolean; inline; // is current character complete? take `codepoint` then + function invalid (): Boolean; inline; + function completeOrInvalid (): Boolean; inline; + + // process one byte, return `true` if codepoint is ready + function decode (b: Byte): Boolean; inline; overload; + function decode (c: AnsiChar): Boolean; inline; overload; + end; + + +// ////////////////////////////////////////////////////////////////////////// // // does filename have one of ".wad", ".pk3", ".zip" extensions? function hasWadExtension (fn: AnsiString): Boolean; @@ -34,8 +63,10 @@ function addWadExtension (fn: AnsiString): AnsiString; // convert number to strig with nice commas function Int64ToStrComma (i: Int64): AnsiString; -function UpCase1251 (ch: Char): Char; -function LoCase1251 (ch: Char): Char; +function UpCase1251 (ch: AnsiChar): AnsiChar; inline; +function LoCase1251 (ch: AnsiChar): AnsiChar; inline; + +function toLowerCase1251 (const s: AnsiString): AnsiString; // `true` if strings are equal; ignoring case for cp1251 function StrEquCI1251 (const s0, s1: AnsiString): Boolean; @@ -48,6 +79,7 @@ function utf8to1251 (s: AnsiString): AnsiString; // `lastIsDir` should be `true` if we are searching for directory // nobody cares about shitdoze, so i'll use the same code path for it function findFileCI (var pathname: AnsiString; lastIsDir: Boolean=false): Boolean; +function findFileCIStr (pathname: AnsiString): AnsiString; // they throws function openDiskFileRO (pathname: AnsiString): TStream; @@ -92,9 +124,283 @@ function readInt64BE (st: TStream): Int64; function readUInt64BE (st: TStream): UInt64; +type + TFormatStrFCallback = procedure (constref buf; len: SizeUInt); + +function wchar2win (wc: WideChar): AnsiChar; inline; +function utf2win (const s: AnsiString): AnsiString; +function win2utf (const s: AnsiString): AnsiString; +function digitInBase (ch: AnsiChar; base: Integer): Integer; + +// returns string in single or double quotes +// single quotes supports only pascal-style '' for single quote char +// double quotes supports c-style escapes +// function will select quote mode automatically +function quoteStr (const s: AnsiString): AnsiString; + + +// ////////////////////////////////////////////////////////////////////////// // +var + wc2shitmap: array[0..65535] of AnsiChar; + wc2shitmapInited: Boolean = false; + + +// ////////////////////////////////////////////////////////////////////////// // +const + cp1251: array[0..127] of Word = ( + $0402,$0403,$201A,$0453,$201E,$2026,$2020,$2021,$20AC,$2030,$0409,$2039,$040A,$040C,$040B,$040F, + $0452,$2018,$2019,$201C,$201D,$2022,$2013,$2014,$003F,$2122,$0459,$203A,$045A,$045C,$045B,$045F, + $00A0,$040E,$045E,$0408,$00A4,$0490,$00A6,$00A7,$0401,$00A9,$0404,$00AB,$00AC,$00AD,$00AE,$0407, + $00B0,$00B1,$0406,$0456,$0491,$00B5,$00B6,$00B7,$0451,$2116,$0454,$00BB,$0458,$0405,$0455,$0457, + $0410,$0411,$0412,$0413,$0414,$0415,$0416,$0417,$0418,$0419,$041A,$041B,$041C,$041D,$041E,$041F, + $0420,$0421,$0422,$0423,$0424,$0425,$0426,$0427,$0428,$0429,$042A,$042B,$042C,$042D,$042E,$042F, + $0430,$0431,$0432,$0433,$0434,$0435,$0436,$0437,$0438,$0439,$043A,$043B,$043C,$043D,$043E,$043F, + $0440,$0441,$0442,$0443,$0444,$0445,$0446,$0447,$0448,$0449,$044A,$044B,$044C,$044D,$044E,$044F + ); + implementation +procedure initShitMap (); +var + f: Integer; +begin + for f := 0 to High(wc2shitmap) do wc2shitmap[f] := '?'; + for f := 0 to 127 do wc2shitmap[f] := AnsiChar(f); + for f := 0 to 127 do wc2shitmap[cp1251[f]] := AnsiChar(f+128); + wc2shitmapInited := true; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +// fast state-machine based UTF-8 decoder; using 8 bytes of memory +// code points from invalid range will never be valid, this is the property of the state machine +const + // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + utf8dfa: array[0..$16c-1] of Byte = ( + // maps bytes to character classes + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 00-0f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 10-1f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 20-2f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 30-3f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 40-4f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 50-5f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 60-6f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 70-7f + $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01, // 80-8f + $09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09, // 90-9f + $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // a0-af + $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // b0-bf + $08,$08,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // c0-cf + $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // d0-df + $0a,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$04,$03,$03, // e0-ef + $0b,$06,$06,$06,$05,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08, // f0-ff + // maps a combination of a state of the automaton and a character class to a state + $00,$0c,$18,$24,$3c,$60,$54,$0c,$0c,$0c,$30,$48,$0c,$0c,$0c,$0c, // 100-10f + $0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$00,$0c,$0c,$0c,$0c,$0c,$00, // 110-11f + $0c,$00,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$18,$0c,$0c, // 120-12f + $0c,$0c,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c, // 130-13f + $0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$24, // 140-14f + $0c,$24,$0c,$0c,$0c,$24,$0c,$0c,$0c,$0c,$0c,$24,$0c,$24,$0c,$0c, // 150-15f + $0c,$24,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c); + + +// ////////////////////////////////////////////////////////////////////////// // +constructor TUtf8DecoderFast.Create (v: Boolean{fuck you, fpc}); begin state := Accept; codepoint := 0; end; + +procedure TUtf8DecoderFast.reset (); inline; begin state := Accept; codepoint := 0; end; + +function TUtf8DecoderFast.complete (): Boolean; inline; begin result := (state = Accept); end; +function TUtf8DecoderFast.invalid (): Boolean; inline; begin result := (state = Reject); end; +function TUtf8DecoderFast.completeOrInvalid (): Boolean; inline; begin result := (state = Accept) or (state = Reject); end; + +function TUtf8DecoderFast.decode (c: AnsiChar): Boolean; inline; overload; begin result := decode(Byte(c)); end; + +function TUtf8DecoderFast.decode (b: Byte): Boolean; inline; overload; +var + tp: LongWord; +begin + if (state = Reject) then begin state := Accept; codepoint := 0; end; + tp := utf8dfa[b]; + if (state <> Accept) then codepoint := (b and $3f) or (codepoint shl 6) else codepoint := ($ff shr tp) and b; + state := utf8dfa[256+state+tp]; + if (state = Reject) then begin codepoint := Replacement; state := Accept; end; + result := (state = Accept); +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function wchar2win (wc: WideChar): AnsiChar; inline; +begin + if not wc2shitmapInited then initShitMap(); + if (LongWord(wc) > 65535) then result := '?' else result := wc2shitmap[LongWord(wc)]; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function utf2win (const s: AnsiString): AnsiString; +var + f, c: Integer; + ud: TUtf8DecoderFast; +begin + for f := 1 to Length(s) do + begin + if (Byte(s[f]) > 127) then + begin + ud := TUtf8DecoderFast.Create(true); + result := ''; + for c := 1 to Length(s) do + begin + if ud.decode(s[c]) then result += wchar2win(WideChar(ud.codepoint)); + end; + exit; + end; + end; + result := s; +end; + + +function win2utf (const s: AnsiString): AnsiString; +var + f, c: Integer; + + function utf8Encode (code: Integer): AnsiString; + begin + if (code < 0) or (code > $10FFFF) then begin result := '?'; exit; end; + if (code <= $7f) then + begin + result := Char(code and $ff); + end + else if (code <= $7FF) then + begin + result := Char($C0 or (code shr 6)); + result += Char($80 or (code and $3F)); + end + else if (code <= $FFFF) then + begin + result := Char($E0 or (code shr 12)); + result += Char($80 or ((code shr 6) and $3F)); + result += Char($80 or (code and $3F)); + end + else if (code <= $10FFFF) then + begin + result := Char($F0 or (code shr 18)); + result += Char($80 or ((code shr 12) and $3F)); + result += Char($80 or ((code shr 6) and $3F)); + result += Char($80 or (code and $3F)); + end + else + begin + result := '?'; + end; + end; + +begin + for f := 1 to Length(s) do + begin + if (Byte(s[f]) > 127) then + begin + result := ''; + for c := 1 to Length(s) do + begin + if (Byte(s[c]) < 128) then + begin + result += s[c]; + end + else + begin + result += utf8Encode(cp1251[Byte(s[c])-128]) + end; + end; + exit; + end; + end; + result := s; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function digitInBase (ch: AnsiChar; base: Integer): Integer; +begin + result := -1; + if (base < 1) or (base > 36) then exit; + if (ch < '0') then exit; + if (base <= 10) then + begin + if (Integer(ch) >= 48+base) then exit; + result := Integer(ch)-48; + end + else + begin + if (ch >= '0') and (ch <= '9') then begin result := Integer(ch)-48; exit; end; + if (ch >= 'a') and (ch <= 'z') then Dec(ch, 32); // poor man's tolower() + if (ch < 'A') or (Integer(ch) >= 65+(base-10)) then exit; + result := Integer(ch)-65+10; + end; +end; + +// ////////////////////////////////////////////////////////////////////////// // +function quoteStr (const s: AnsiString): AnsiString; + + function squote (const s: AnsiString): AnsiString; + var + f: Integer; + begin + result := ''''; + for f := 1 to Length(s) do + begin + if (s[f] = '''') then result += ''''; + result += s[f]; + end; + result += ''''; + end; + + function dquote (const s: AnsiString): AnsiString; + var + f: Integer; + ch: AnsiChar; + begin + result := '"'; + for f := 1 to Length(s) do + begin + ch := s[f]; + if (ch = #0) then result += '\z' + else if (ch = #9) then result += '\t' + else if (ch = #10) then result += '\n' + else if (ch = #13) then result += '\r' + else if (ch = #27) then result += '\e' + else if (ch < ' ') or (ch = #127) then + begin + result += '\x'; + result += LowerCase(IntToHex(Integer(ch), 2)); + end + else if (ch = '"') or (ch = '\') then + begin + result += '\'; + result += ch; + end + else + begin + result += ch; + end; + end; + result += '"'; + end; + +var + needSingle: Boolean = false; + f: Integer; +begin + for f := 1 to Length(s) do + begin + if (s[f] = '''') then begin needSingle := true; continue; end; + if (s[f] < ' ') or (s[f] = #127) then begin result := dquote(s); exit; end; + end; + if needSingle then result := squote(s) else result := ''''+s+''''; +end; + + +// ////////////////////////////////////////////////////////////////////////// // function hasWadExtension (fn: AnsiString): Boolean; begin fn := ExtractFileExt(fn); @@ -146,7 +452,7 @@ begin end; -function UpCase1251 (ch: Char): Char; +function UpCase1251 (ch: AnsiChar): AnsiChar; inline; begin if ch < #128 then begin @@ -170,7 +476,7 @@ begin end; -function LoCase1251 (ch: Char): Char; +function LoCase1251 (ch: AnsiChar): AnsiChar; inline; begin if ch < #128 then begin @@ -205,6 +511,26 @@ begin end; +function toLowerCase1251 (const s: AnsiString): AnsiString; +var + f: Integer; + ch: AnsiChar; +begin + for ch in s do + begin + if (ch <> LoCase1251(ch)) then + begin + result := ''; + SetLength(result, Length(s)); + for f := 1 to Length(s) do result[f] := LoCase1251(s[f]); + exit; + end; + end; + // nothing to do + result := s; +end; + + // ////////////////////////////////////////////////////////////////////////// // // utils // `ch`: utf8 start @@ -390,6 +716,11 @@ begin if result then pathname := newname; end; +function findFileCIStr (pathname: AnsiString): AnsiString; +begin + result := pathname; + findFileCI(result); +end; function openDiskFileRO (pathname: AnsiString): TStream; begin @@ -523,5 +854,5 @@ function readLongIntBE (st: TStream): LongInt; begin readIntegerBE(st, @result, function readInt64BE (st: TStream): Int64; begin readIntegerBE(st, @result, 8); end; function readUInt64BE (st: TStream): UInt64; begin readIntegerBE(st, @result, 8); end; - end. +