X-Git-Url: http://deadsoftware.ru/gitweb?p=d2df-editor.git;a=blobdiff_plain;f=src%2Fshared%2Futils.pas;h=04780bb6eae7a0ee216ab2465c71ec19d6d8b234;hp=9bb12ec632dab6bb18c979f50d589c86de54558c;hb=3dc2fe6b3d29cd54425db8f590e922f2dce50e99;hpb=212386359ea0af4e0a861f9bf92c45a1516ac87c diff --git a/src/shared/utils.pas b/src/shared/utils.pas index 9bb12ec..04780bb 100644 --- a/src/shared/utils.pas +++ b/src/shared/utils.pas @@ -13,7 +13,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . *) -{$MODE DELPHI} +{$INCLUDE a_modes.inc} unit utils; interface @@ -22,6 +22,36 @@ uses SysUtils, Classes; +// ////////////////////////////////////////////////////////////////////////// // +type + TUtf8DecoderFast = packed record + public + const Replacement = $FFFD; // replacement char for invalid unicode + const Accept = 0; + const Reject = 12; + + private + state: LongWord; + + public + codepoint: LongWord; // decoded codepoint (valid only when decoder is in "complete" state) + + public + constructor Create (v: Boolean{fuck you, fpc}); + + procedure reset (); inline; + + function complete (): Boolean; inline; // is current character complete? take `codepoint` then + function invalid (): Boolean; inline; + function completeOrInvalid (): Boolean; inline; + + // process one byte, return `true` if codepoint is ready + function decode (b: Byte): Boolean; inline; overload; + function decode (c: AnsiChar): Boolean; inline; overload; + end; + + +// ////////////////////////////////////////////////////////////////////////// // // does filename have one of ".wad", ".pk3", ".zip" extensions? function hasWadExtension (fn: AnsiString): Boolean; @@ -48,9 +78,7 @@ function utf8to1251 (s: AnsiString): AnsiString; // `lastIsDir` should be `true` if we are searching for directory // nobody cares about shitdoze, so i'll use the same code path for it function findFileCI (var pathname: AnsiString; lastIsDir: Boolean=false): Boolean; - -// returns name (the same if no file found) -function findFileCIStr (pathname: AnsiString; lastIsDir: Boolean=false): AnsiString; +function findFileCIStr (pathname: AnsiString): AnsiString; // they throws function openDiskFileRO (pathname: AnsiString): TStream; @@ -95,9 +123,283 @@ function readInt64BE (st: TStream): Int64; function readUInt64BE (st: TStream): UInt64; +type + TFormatStrFCallback = procedure (constref buf; len: SizeUInt); + +function wchar2win (wc: WideChar): AnsiChar; inline; +function utf2win (const s: AnsiString): AnsiString; +function win2utf (const s: AnsiString): AnsiString; +function digitInBase (ch: AnsiChar; base: Integer): Integer; + +// returns string in single or double quotes +// single quotes supports only pascal-style '' for single quote char +// double quotes supports c-style escapes +// function will select quote mode automatically +function quoteStr (const s: AnsiString): AnsiString; + + +// ////////////////////////////////////////////////////////////////////////// // +var + wc2shitmap: array[0..65535] of AnsiChar; + wc2shitmapInited: Boolean = false; + + +// ////////////////////////////////////////////////////////////////////////// // +const + cp1251: array[0..127] of Word = ( + $0402,$0403,$201A,$0453,$201E,$2026,$2020,$2021,$20AC,$2030,$0409,$2039,$040A,$040C,$040B,$040F, + $0452,$2018,$2019,$201C,$201D,$2022,$2013,$2014,$003F,$2122,$0459,$203A,$045A,$045C,$045B,$045F, + $00A0,$040E,$045E,$0408,$00A4,$0490,$00A6,$00A7,$0401,$00A9,$0404,$00AB,$00AC,$00AD,$00AE,$0407, + $00B0,$00B1,$0406,$0456,$0491,$00B5,$00B6,$00B7,$0451,$2116,$0454,$00BB,$0458,$0405,$0455,$0457, + $0410,$0411,$0412,$0413,$0414,$0415,$0416,$0417,$0418,$0419,$041A,$041B,$041C,$041D,$041E,$041F, + $0420,$0421,$0422,$0423,$0424,$0425,$0426,$0427,$0428,$0429,$042A,$042B,$042C,$042D,$042E,$042F, + $0430,$0431,$0432,$0433,$0434,$0435,$0436,$0437,$0438,$0439,$043A,$043B,$043C,$043D,$043E,$043F, + $0440,$0441,$0442,$0443,$0444,$0445,$0446,$0447,$0448,$0449,$044A,$044B,$044C,$044D,$044E,$044F + ); + implementation +procedure initShitMap (); +var + f: Integer; +begin + for f := 0 to High(wc2shitmap) do wc2shitmap[f] := '?'; + for f := 0 to 127 do wc2shitmap[f] := AnsiChar(f); + for f := 0 to 127 do wc2shitmap[cp1251[f]] := AnsiChar(f+128); + wc2shitmapInited := true; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +// fast state-machine based UTF-8 decoder; using 8 bytes of memory +// code points from invalid range will never be valid, this is the property of the state machine +const + // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + utf8dfa: array[0..$16c-1] of Byte = ( + // maps bytes to character classes + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 00-0f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 10-1f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 20-2f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 30-3f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 40-4f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 50-5f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 60-6f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 70-7f + $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01, // 80-8f + $09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09, // 90-9f + $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // a0-af + $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // b0-bf + $08,$08,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // c0-cf + $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // d0-df + $0a,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$04,$03,$03, // e0-ef + $0b,$06,$06,$06,$05,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08, // f0-ff + // maps a combination of a state of the automaton and a character class to a state + $00,$0c,$18,$24,$3c,$60,$54,$0c,$0c,$0c,$30,$48,$0c,$0c,$0c,$0c, // 100-10f + $0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$00,$0c,$0c,$0c,$0c,$0c,$00, // 110-11f + $0c,$00,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$18,$0c,$0c, // 120-12f + $0c,$0c,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c, // 130-13f + $0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$24, // 140-14f + $0c,$24,$0c,$0c,$0c,$24,$0c,$0c,$0c,$0c,$0c,$24,$0c,$24,$0c,$0c, // 150-15f + $0c,$24,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c); + + +// ////////////////////////////////////////////////////////////////////////// // +constructor TUtf8DecoderFast.Create (v: Boolean{fuck you, fpc}); begin state := Accept; codepoint := 0; end; + +procedure TUtf8DecoderFast.reset (); inline; begin state := Accept; codepoint := 0; end; + +function TUtf8DecoderFast.complete (): Boolean; inline; begin result := (state = Accept); end; +function TUtf8DecoderFast.invalid (): Boolean; inline; begin result := (state = Reject); end; +function TUtf8DecoderFast.completeOrInvalid (): Boolean; inline; begin result := (state = Accept) or (state = Reject); end; + +function TUtf8DecoderFast.decode (c: AnsiChar): Boolean; inline; overload; begin result := decode(Byte(c)); end; + +function TUtf8DecoderFast.decode (b: Byte): Boolean; inline; overload; +var + tp: LongWord; +begin + if (state = Reject) then begin state := Accept; codepoint := 0; end; + tp := utf8dfa[b]; + if (state <> Accept) then codepoint := (b and $3f) or (codepoint shl 6) else codepoint := ($ff shr tp) and b; + state := utf8dfa[256+state+tp]; + if (state = Reject) then begin codepoint := Replacement; state := Accept; end; + result := (state = Accept); +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function wchar2win (wc: WideChar): AnsiChar; inline; +begin + if not wc2shitmapInited then initShitMap(); + if (LongWord(wc) > 65535) then result := '?' else result := wc2shitmap[LongWord(wc)]; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function utf2win (const s: AnsiString): AnsiString; +var + f, c: Integer; + ud: TUtf8DecoderFast; +begin + for f := 1 to Length(s) do + begin + if (Byte(s[f]) > 127) then + begin + ud := TUtf8DecoderFast.Create(true); + result := ''; + for c := 1 to Length(s) do + begin + if ud.decode(s[c]) then result += wchar2win(WideChar(ud.codepoint)); + end; + exit; + end; + end; + result := s; +end; + + +function win2utf (const s: AnsiString): AnsiString; +var + f, c: Integer; + + function utf8Encode (code: Integer): AnsiString; + begin + if (code < 0) or (code > $10FFFF) then begin result := '?'; exit; end; + if (code <= $7f) then + begin + result := Char(code and $ff); + end + else if (code <= $7FF) then + begin + result := Char($C0 or (code shr 6)); + result += Char($80 or (code and $3F)); + end + else if (code <= $FFFF) then + begin + result := Char($E0 or (code shr 12)); + result += Char($80 or ((code shr 6) and $3F)); + result += Char($80 or (code and $3F)); + end + else if (code <= $10FFFF) then + begin + result := Char($F0 or (code shr 18)); + result += Char($80 or ((code shr 12) and $3F)); + result += Char($80 or ((code shr 6) and $3F)); + result += Char($80 or (code and $3F)); + end + else + begin + result := '?'; + end; + end; + +begin + for f := 1 to Length(s) do + begin + if (Byte(s[f]) > 127) then + begin + result := ''; + for c := 1 to Length(s) do + begin + if (Byte(s[c]) < 128) then + begin + result += s[c]; + end + else + begin + result += utf8Encode(cp1251[Byte(s[c])-128]) + end; + end; + exit; + end; + end; + result := s; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function digitInBase (ch: AnsiChar; base: Integer): Integer; +begin + result := -1; + if (base < 1) or (base > 36) then exit; + if (ch < '0') then exit; + if (base <= 10) then + begin + if (Integer(ch) >= 48+base) then exit; + result := Integer(ch)-48; + end + else + begin + if (ch >= '0') and (ch <= '9') then begin result := Integer(ch)-48; exit; end; + if (ch >= 'a') and (ch <= 'z') then Dec(ch, 32); // poor man's tolower() + if (ch < 'A') or (Integer(ch) >= 65+(base-10)) then exit; + result := Integer(ch)-65+10; + end; +end; + +// ////////////////////////////////////////////////////////////////////////// // +function quoteStr (const s: AnsiString): AnsiString; + + function squote (const s: AnsiString): AnsiString; + var + f: Integer; + begin + result := ''''; + for f := 1 to Length(s) do + begin + if (s[f] = '''') then result += ''''; + result += s[f]; + end; + result += ''''; + end; + + function dquote (const s: AnsiString): AnsiString; + var + f: Integer; + ch: AnsiChar; + begin + result := '"'; + for f := 1 to Length(s) do + begin + ch := s[f]; + if (ch = #0) then result += '\z' + else if (ch = #9) then result += '\t' + else if (ch = #10) then result += '\n' + else if (ch = #13) then result += '\r' + else if (ch = #27) then result += '\e' + else if (ch < ' ') or (ch = #127) then + begin + result += '\x'; + result += LowerCase(IntToHex(Integer(ch), 2)); + end + else if (ch = '"') or (ch = '\') then + begin + result += '\'; + result += ch; + end + else + begin + result += ch; + end; + end; + result += '"'; + end; + +var + needSingle: Boolean = false; + f: Integer; +begin + for f := 1 to Length(s) do + begin + if (s[f] = '''') then begin needSingle := true; continue; end; + if (s[f] < ' ') or (s[f] = #127) then begin result := dquote(s); exit; end; + end; + if needSingle then result := squote(s) else result := ''''+s+''''; +end; + + +// ////////////////////////////////////////////////////////////////////////// // function hasWadExtension (fn: AnsiString): Boolean; begin fn := ExtractFileExt(fn); @@ -357,7 +659,7 @@ begin // remove trailing slashes again while (length(npt) > 0) and ((npt[1] = '/') or (npt[1] = '\')) do Delete(npt, 1, 1); wantdir := lastIsDir or (length(npt) > 0); // do we want directory here? - writeln(Format('0: npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d', [npt, newname, curname, Integer(wantdir)])); + //writeln(Format('npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d', [npt, newname, curname, Integer(wantdir)])); // try the easiest case first attr := FileGetAttr(newname+curname); if attr <> -1 then @@ -365,13 +667,12 @@ begin if wantdir = ((attr and faDirectory) <> 0) then begin // i found her! - writeln(Format('3: npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d; found=tan', [npt, newname, curname, Integer(wantdir)])); newname := newname+curname; if wantdir then newname := newname+'/'; continue; end; end; - writeln(Format('1: npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d', [npt, newname, curname, Integer(wantdir)])); + //writeln(Format('npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d', [npt, newname, curname, Integer(wantdir)])); // alas, either not found, or invalid attributes foundher := false; try @@ -389,28 +690,15 @@ begin finally FindClose(sr); end; - if (foundher) then - begin - writeln(Format('2: npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d; found=tan', [npt, newname, curname, Integer(wantdir)])); - end - else - begin - writeln(Format('2: npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d; found=ona', [npt, newname, curname, Integer(wantdir)])); - end; if not foundher then begin newname := ''; result := false; break; end; end; - writeln(Format('4: npt=[%s]; newname=[%s]; curname=[%s]; wantdir=%d; found=ona', [npt, newname, curname, Integer(wantdir)])); if result then pathname := newname; end; -function findFileCIStr (pathname: AnsiString; lastIsDir: Boolean=false): AnsiString; -var - s: AnsiString; +function findFileCIStr (pathname: AnsiString): AnsiString; begin - s := pathname; - if not findFileCI(s, lastIsDir) then s := pathname; - writeln(Format('***: pathname=[%s]; s=[%s]', [pathname, s])); - result := s; + result := pathname; + findFileCI(result); end; function openDiskFileRO (pathname: AnsiString): TStream; @@ -545,5 +833,5 @@ function readLongIntBE (st: TStream): LongInt; begin readIntegerBE(st, @result, function readInt64BE (st: TStream): Int64; begin readIntegerBE(st, @result, 8); end; function readUInt64BE (st: TStream): UInt64; begin readIntegerBE(st, @result, 8); end; - end. +