From 702b2acc2df232119ee6274711134f1d2be19c76 Mon Sep 17 00:00:00 2001 From: Ketmar Dark Date: Sun, 27 Aug 2017 11:24:13 +0300 Subject: [PATCH] moved some utility functions from xparser to utils --- src/shared/utils.pas | 239 +++++++++++++++++++++++++++++++++++++++++ src/shared/xdynrec.pas | 12 +-- src/shared/xparser.pas | 201 ---------------------------------- 3 files changed, 245 insertions(+), 207 deletions(-) diff --git a/src/shared/utils.pas b/src/shared/utils.pas index 36d59ff..c182da1 100644 --- a/src/shared/utils.pas +++ b/src/shared/utils.pas @@ -22,6 +22,36 @@ uses SysUtils, Classes; +// ////////////////////////////////////////////////////////////////////////// // +type + TUtf8DecoderFast = packed record + public + const Replacement = $FFFD; // replacement char for invalid unicode + const Accept = 0; + const Reject = 12; + + private + state: LongWord; + + public + codepoint: LongWord; // decoded codepoint (valid only when decoder is in "complete" state) + + public + constructor Create (v: Boolean{fuck you, fpc}); + + procedure reset (); inline; + + function complete (): Boolean; inline; // is current character complete? take `codepoint` then + function invalid (): Boolean; inline; + function completeOrInvalid (): Boolean; inline; + + // process one byte, return `true` if codepoint is ready + function decode (b: Byte): Boolean; inline; overload; + function decode (c: AnsiChar): Boolean; inline; overload; + end; + + +// ////////////////////////////////////////////////////////////////////////// // // does filename have one of ".wad", ".pk3", ".zip" extensions? 
function hasWadExtension (fn: AnsiString): Boolean; @@ -98,10 +128,219 @@ type // returns formatted string if `writerCB` is `nil`, empty string otherwise function formatstrf (const fmt: AnsiString; args: array of const; writerCB: TFormatStrFCallback=nil): AnsiString; +function wchar2win (wc: WideChar): AnsiChar; inline; +function utf2win (const s: AnsiString): AnsiString; +function digitInBase (ch: AnsiChar; base: Integer): Integer; + +// returns string in single or double quotes +// single quotes supports only pascal-style '' for single quote char +// double quotes supports c-style escapes +// function will select quote mode automatically +function quoteStr (const s: AnsiString): AnsiString; + implementation +var + wc2shitmap: array[0..65535] of AnsiChar; + wc2shitmapInited: Boolean = false; + + +// ////////////////////////////////////////////////////////////////////////// // +procedure initShitMap (); // fills wc2shitmap: '?' everywhere, then ASCII identity, then the cp1251 table for bytes 128..255 +const + cp1251: array[0..127] of Word = ( + $0402,$0403,$201A,$0453,$201E,$2026,$2020,$2021,$20AC,$2030,$0409,$2039,$040A,$040C,$040B,$040F, + $0452,$2018,$2019,$201C,$201D,$2022,$2013,$2014,$003F,$2122,$0459,$203A,$045A,$045C,$045B,$045F, + $00A0,$040E,$045E,$0408,$00A4,$0490,$00A6,$00A7,$0401,$00A9,$0404,$00AB,$00AC,$00AD,$00AE,$0407, + $00B0,$00B1,$0406,$0456,$0491,$00B5,$00B6,$00B7,$0451,$2116,$0454,$00BB,$0458,$0405,$0455,$0457, + $0410,$0411,$0412,$0413,$0414,$0415,$0416,$0417,$0418,$0419,$041A,$041B,$041C,$041D,$041E,$041F, + $0420,$0421,$0422,$0423,$0424,$0425,$0426,$0427,$0428,$0429,$042A,$042B,$042C,$042D,$042E,$042F, + $0430,$0431,$0432,$0433,$0434,$0435,$0436,$0437,$0438,$0439,$043A,$043B,$043C,$043D,$043E,$043F, + $0440,$0441,$0442,$0443,$0444,$0445,$0446,$0447,$0448,$0449,$044A,$044B,$044C,$044D,$044E,$044F + ); +var + f: Integer; +begin + for f := 0 to High(wc2shitmap) do wc2shitmap[f] := '?'; + for f := 0 to 127 do wc2shitmap[f] := AnsiChar(f); + for f := 0 to 127 do if (cp1251[f] > 127) then wc2shitmap[cp1251[f]] := AnsiChar(f+128); // table slot 24 holds $003F as a placeholder; skipping entries <= 127 keeps '?' mapped to '?' instead of #$98 + wc2shitmapInited := true; +end; + + +// 
////////////////////////////////////////////////////////////////////////// // +// fast state-machine based UTF-8 decoder; using 8 bytes of memory +// code points from invalid range will never be valid, this is the property of the state machine +const + // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + utf8dfa: array[0..$16c-1] of Byte = ( + // maps bytes to character classes + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 00-0f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 10-1f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 20-2f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 30-3f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 40-4f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 50-5f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 60-6f + $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 70-7f + $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01, // 80-8f + $09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09, // 90-9f + $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // a0-af + $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // b0-bf + $08,$08,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // c0-cf + $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // d0-df + $0a,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$04,$03,$03, // e0-ef + $0b,$06,$06,$06,$05,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08, // f0-ff + // maps a combination of a state of the automaton and a character class to a state + $00,$0c,$18,$24,$3c,$60,$54,$0c,$0c,$0c,$30,$48,$0c,$0c,$0c,$0c, // 100-10f + $0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$00,$0c,$0c,$0c,$0c,$0c,$00, // 110-11f + $0c,$00,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$18,$0c,$0c, // 120-12f + $0c,$0c,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c, // 130-13f + 
$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$24, // 140-14f + $0c,$24,$0c,$0c,$0c,$24,$0c,$0c,$0c,$0c,$0c,$24,$0c,$24,$0c,$0c, // 150-15f + $0c,$24,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c); + + +// ////////////////////////////////////////////////////////////////////////// // +constructor TUtf8DecoderFast.Create (v: Boolean{fuck you, fpc}); begin state := Accept; codepoint := 0; end; + +procedure TUtf8DecoderFast.reset (); inline; begin state := Accept; codepoint := 0; end; + +function TUtf8DecoderFast.complete (): Boolean; inline; begin result := (state = Accept); end; +function TUtf8DecoderFast.invalid (): Boolean; inline; begin result := (state = Reject); end; +function TUtf8DecoderFast.completeOrInvalid (): Boolean; inline; begin result := (state = Accept) or (state = Reject); end; + +function TUtf8DecoderFast.decode (c: AnsiChar): Boolean; inline; overload; begin result := decode(Byte(c)); end; + +function TUtf8DecoderFast.decode (b: Byte): Boolean; inline; overload; +var + tp: LongWord; +begin + if (state = Reject) then begin state := Accept; codepoint := 0; end; + tp := utf8dfa[b]; + if (state <> Accept) then codepoint := (b and $3f) or (codepoint shl 6) else codepoint := ($ff shr tp) and b; + state := utf8dfa[256+state+tp]; + if (state = Reject) then begin codepoint := Replacement; state := Accept; end; + result := (state = Accept); +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function wchar2win (wc: WideChar): AnsiChar; inline; +begin + if not wc2shitmapInited then initShitMap(); + if (LongWord(wc) > 65535) then result := '?' 
else result := wc2shitmap[LongWord(wc)]; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function utf2win (const s: AnsiString): AnsiString; // decodes UTF-8 and maps each codepoint through wchar2win; input without bytes > 127 is returned unchanged +var + f, c: Integer; + ud: TUtf8DecoderFast; +begin + for f := 1 to Length(s) do + begin + if (Byte(s[f]) > 127) then + begin + ud := TUtf8DecoderFast.Create(true); + result := ''; + for c := 1 to Length(s) do + begin + if ud.decode(s[c]) then begin if (ud.codepoint > $FFFF) then result += '?' else result += wchar2win(WideChar(ud.codepoint)); end; // the WideChar cast keeps only 16 bits, so codepoints above $FFFF must be replaced before casting + end; + exit; + end; + end; + result := s; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function digitInBase (ch: AnsiChar; base: Integer): Integer; +begin + result := -1; + if (base < 1) or (base > 36) then exit; + if (ch < '0') then exit; + if (base <= 10) then + begin + if (Integer(ch) >= 48+base) then exit; + result := Integer(ch)-48; + end + else + begin + if (ch >= '0') and (ch <= '9') then begin result := Integer(ch)-48; exit; end; + if (ch >= 'a') and (ch <= 'z') then Dec(ch, 32); // poor man's toupper() + if (ch < 'A') or (Integer(ch) >= 65+(base-10)) then exit; + result := Integer(ch)-65+10; + end; +end; + + +// ////////////////////////////////////////////////////////////////////////// // +function quoteStr (const s: AnsiString): AnsiString; + + function squote (const s: AnsiString): AnsiString; + var + f: Integer; + begin + result := ''''; + for f := 1 to Length(s) do + begin + if (s[f] = '''') then result += ''''; + result += s[f]; + end; + result += ''''; + end; + + function dquote (const s: AnsiString): AnsiString; + var + f: Integer; + ch: AnsiChar; + begin + result := '"'; + for f := 1 to Length(s) do + begin + ch := s[f]; + if (ch = #0) then result += '\z' + else if (ch = #9) then result += '\t' + else if (ch = #10) then result += '\n' + else if (ch = #13) then result += '\r' + else if (ch = #27) then result += '\e' + else if (ch < ' ') or (ch = #127) then + begin + result += '\x'; + result += LowerCase(IntToHex(Integer(ch), 2)); + 
end + else if (ch = '"') or (ch = '\') then + begin + result += '\'; + result += ch; + end + else + begin + result += ch; + end; + end; + result += '"'; + end; + +var + needSingle: Boolean = false; + f: Integer; +begin + for f := 1 to Length(s) do + begin + if (s[f] = '''') then begin needSingle := true; continue; end; + if (s[f] < ' ') or (s[f] = #127) then begin result := dquote(s); exit; end; + end; + if needSingle then result := squote(s) else result := ''''+s+''''; +end; + + +// ////////////////////////////////////////////////////////////////////////// // function hasWadExtension (fn: AnsiString): Boolean; begin fn := ExtractFileExt(fn); diff --git a/src/shared/xdynrec.pas b/src/shared/xdynrec.pas index f06df08..2aec726 100644 --- a/src/shared/xdynrec.pas +++ b/src/shared/xdynrec.pas @@ -504,7 +504,7 @@ end; function TDynField.definition (): AnsiString; begin - result := mPasName+' is '+TTextParser.quote(mName)+' type '; + result := mPasName+' is '+quoteStr(mName)+' type '; result += getTypeName(mType); if (mMaxDim >= 0) then result += Format('[%d]', [mMaxDim]); if (mRecOfs >= 0) then result += Format(' offset %d', [mRecOfs]); @@ -679,7 +679,7 @@ begin else if (fldtype = 'trigdata') then mType := TType.TTrigData else raise Exception.Create(Format('field ''%s'' has invalid type ''%s''', [fldname, fldtype])); - if hasdefStr then self.mDefUnparsed := TTextParser.quote(defstr) + if hasdefStr then self.mDefUnparsed := quoteStr(defstr) else if hasdefInt then self.mDefUnparsed := Format('%d', [defint]) else if hasdefId then self.mDefUnparsed := defstr; @@ -795,7 +795,7 @@ begin else begin if (Length(mSVal) > mMaxDim) then raise Exception.Create(Format('invalid string size definition for field ''%s''', [mName])); - s := utfTo1251(mSVal); + s := utf2win(mSVal); if (Length(s) > 0) then st.WriteBuffer(PChar(s)^, Length(s)); for f := Length(s) to mMaxDim do writeInt(st, Byte(0)); end; @@ -951,7 +951,7 @@ begin TType.TChar: begin if (mMaxDim = 0) then raise 
Exception.Create(Format('invalid string size definition for field ''%s''', [mName])); - wr.put(TTextParser.quote(mSVal)); + wr.put(quoteStr(mSVal)); wr.put(';'#10); exit; end; @@ -967,7 +967,7 @@ begin end; TType.TString: begin - wr.put(TTextParser.quote(mSVal)); + wr.put(quoteStr(mSVal)); wr.put(';'#10); exit; end; @@ -1480,7 +1480,7 @@ begin else begin // record - result := mPasName+' is '+TTextParser.quote(mName); + result := mPasName+' is '+quoteStr(mName); if (mSize >= 0) then result += Format(' size %d bytes', [mSize]); if mHeader then result += ' header'; end; diff --git a/src/shared/xparser.pas b/src/shared/xparser.pas index a7b8a51..fe17d0c 100644 --- a/src/shared/xparser.pas +++ b/src/shared/xparser.pas @@ -22,35 +22,6 @@ uses Classes; -// ////////////////////////////////////////////////////////////////////////// // -type - TUtf8DecoderFast = packed record - public - const Replacement = $FFFD; // replacement char for invalid unicode - const Accept = 0; - const Reject = 12; - - private - state: LongWord; - - public - codepoint: LongWord; // decoded codepoint (valid only when decoder is in "complete" state) - - public - constructor Create (v: Boolean{fuck you, fpc}); - - procedure reset (); inline; - - function complete (): Boolean; inline; // is current character complete? 
take `codepoint` then - function invalid (): Boolean; inline; - function completeOrInvalid (): Boolean; inline; - - // process one byte, return `true` if codepoint is ready - function decode (b: Byte): Boolean; inline; overload; - function decode (c: AnsiChar): Boolean; inline; overload; - end; - - // ////////////////////////////////////////////////////////////////////////// // type TTextParser = class @@ -85,9 +56,6 @@ type procedure warmup (); virtual; // called in constructor to warm up the system procedure loadNextChar (); virtual; abstract; // loads next char into mNextChar; #0 means 'eof' - public - class function quote (const s: AnsiString): AnsiString; - public constructor Create (loadToken: Boolean=true); destructor Destroy (); override; @@ -196,13 +164,6 @@ type end; -// ////////////////////////////////////////////////////////////////////////// // -function wcharTo1251 (wc: WideChar): AnsiChar; inline; -function utfTo1251 (const s: AnsiString): AnsiString; - -function digitInBase (ch: AnsiChar; base: Integer): Integer; - - implementation uses @@ -246,168 +207,6 @@ begin end; -// ////////////////////////////////////////////////////////////////////////// // -// fast state-machine based UTF-8 decoder; using 8 bytes of memory -// code points from invalid range will never be valid, this is the property of the state machine -const - // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - utf8dfa: array[0..$16c-1] of Byte = ( - // maps bytes to character classes - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 00-0f - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 10-1f - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 20-2f - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 30-3f - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 40-4f - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 50-5f - 
$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 60-6f - $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 70-7f - $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01, // 80-8f - $09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09, // 90-9f - $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // a0-af - $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // b0-bf - $08,$08,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // c0-cf - $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // d0-df - $0a,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$04,$03,$03, // e0-ef - $0b,$06,$06,$06,$05,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08, // f0-ff - // maps a combination of a state of the automaton and a character class to a state - $00,$0c,$18,$24,$3c,$60,$54,$0c,$0c,$0c,$30,$48,$0c,$0c,$0c,$0c, // 100-10f - $0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$00,$0c,$0c,$0c,$0c,$0c,$00, // 110-11f - $0c,$00,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$18,$0c,$0c, // 120-12f - $0c,$0c,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c, // 130-13f - $0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$24, // 140-14f - $0c,$24,$0c,$0c,$0c,$24,$0c,$0c,$0c,$0c,$0c,$24,$0c,$24,$0c,$0c, // 150-15f - $0c,$24,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c); - - -// ////////////////////////////////////////////////////////////////////////// // -constructor TUtf8DecoderFast.Create (v: Boolean{fuck you, fpc}); begin state := Accept; codepoint := 0; end; - -procedure TUtf8DecoderFast.reset (); inline; begin state := Accept; codepoint := 0; end; - -function TUtf8DecoderFast.complete (): Boolean; inline; begin result := (state = Accept); end; -function TUtf8DecoderFast.invalid (): Boolean; inline; begin result := (state = Reject); end; -function TUtf8DecoderFast.completeOrInvalid (): Boolean; inline; begin result := (state = Accept) or (state = Reject); end; - -function 
TUtf8DecoderFast.decode (c: AnsiChar): Boolean; inline; overload; begin result := decode(Byte(c)); end; - -function TUtf8DecoderFast.decode (b: Byte): Boolean; inline; overload; -var - tp: LongWord; -begin - if (state = Reject) then begin state := Accept; codepoint := 0; end; - tp := utf8dfa[b]; - if (state <> Accept) then codepoint := (b and $3f) or (codepoint shl 6) else codepoint := ($ff shr tp) and b; - state := utf8dfa[256+state+tp]; - if (state = Reject) then begin codepoint := Replacement; state := Accept; end; - result := (state = Accept); -end; - - -// ////////////////////////////////////////////////////////////////////////// // -function utfTo1251 (const s: AnsiString): AnsiString; -var - f, c: Integer; - ud: TUtf8DecoderFast; -begin - for f := 1 to Length(s) do - begin - if (Byte(s[f]) > 127) then - begin - ud := TUtf8DecoderFast.Create(true); - result := ''; - for c := 1 to Length(s) do - begin - if ud.decode(s[c]) then result += wcharTo1251(WideChar(ud.codepoint)); - end; - exit; - end; - end; - result := s; -end; - - -// ////////////////////////////////////////////////////////////////////////// // -function digitInBase (ch: AnsiChar; base: Integer): Integer; -begin - result := -1; - if (base < 1) or (base > 36) then exit; - if (ch < '0') then exit; - if (base <= 10) then - begin - if (Integer(ch) >= 48+base) then exit; - result := Integer(ch)-48; - end - else - begin - if (ch >= '0') and (ch <= '9') then begin result := Integer(ch)-48; exit; end; - if (ch >= 'a') and (ch <= 'z') then Dec(ch, 32); // poor man's tolower() - if (ch < 'A') or (Integer(ch) >= 65+(base-10)) then exit; - result := Integer(ch)-65+10; - end; -end; - - -// ////////////////////////////////////////////////////////////////////////// // -class function TTextParser.quote (const s: AnsiString): AnsiString; - - function squote (const s: AnsiString): AnsiString; - var - f: Integer; - begin - result := ''''; - for f := 1 to Length(s) do - begin - if (s[f] = '''') then result += ''''; - 
result += s[f]; - end; - result += ''''; - end; - - function dquote (const s: AnsiString): AnsiString; - var - f: Integer; - ch: AnsiChar; - begin - result := '"'; - for f := 1 to Length(s) do - begin - ch := s[f]; - if (ch = #0) then result += '\z' - else if (ch = #9) then result += '\t' - else if (ch = #10) then result += '\n' - else if (ch = #13) then result += '\r' - else if (ch = #27) then result += '\e' - else if (ch < ' ') or (ch = #127) then - begin - result += '\x'; - result += LowerCase(IntToHex(Integer(ch), 2)); - end - else if (ch = '"') or (ch = '\') then - begin - result += '\'; - result += ch; - end - else - begin - result += ch; - end; - end; - result += '"'; - end; - -var - needSingle: Boolean = false; - f: Integer; -begin - for f := 1 to Length(s) do - begin - if (s[f] = '''') then begin needSingle := true; continue; end; - if (s[f] < ' ') or (s[f] = #127) then begin result := dquote(s); exit; end; - end; - if needSingle then result := squote(s) else result := ''''+s+''''; -end; - - // ////////////////////////////////////////////////////////////////////////// // constructor TTextParser.Create (loadToken: Boolean=true); begin -- 2.29.2