1 (* Copyright (C) DooM 2D:Forever Developers
3 * This program is free software: you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation, either version 3 of the License, or
6 * (at your option) any later version.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 {$INCLUDE a_modes.inc}
22 // ////////////////////////////////////////////////////////////////////////// //
24 TUtf8DecoderFast
= packed record
26 const Replacement
= $FFFD; // replacement char for invalid unicode
34 codepoint
: LongWord; // decoded codepoint (valid only when decoder is in "complete" state)
37 constructor Create (v
: Boolean{fuck you, fpc});
39 procedure reset (); inline;
41 function complete (): Boolean; inline; // is current character complete? take `codepoint` then
42 function invalid (): Boolean; inline;
43 function completeOrInvalid (): Boolean; inline;
45 // process one byte, return `true` if codepoint is ready
46 function decode (b
: Byte): Boolean; inline; overload
;
47 function decode (c
: AnsiChar): Boolean; inline; overload
;
51 // ////////////////////////////////////////////////////////////////////////// //
60 //TTFloat = 3; // not yet
65 TTBegin
= 8; // left curly
66 TTEnd
= 9; // right curly
67 TTDelim
= 10; // other delimiters
71 mCurChar
, mNextChar
: AnsiChar;
73 mAllowSignedNumbers
: Boolean; // internal control
75 mTokLine
, mTokCol
: Integer; // token start
77 mTokStr
: AnsiString; // string or identifier
78 mTokChar
: AnsiChar; // for delimiters
82 procedure warmup (); virtual; abstract; // called in constructor to warm up the system
83 procedure loadNextChar (); virtual; abstract; // loads next char into mNextChar; #0 means 'eof'
86 class function quote (const s
: AnsiString): AnsiString;
89 constructor Create (loadToken
: Boolean=true);
90 destructor Destroy (); override;
92 function isEOF (): Boolean; inline;
94 function skipChar (): Boolean; // returns `false` on eof
96 function skipBlanks (): Boolean; // ...and comments; returns `false` on eof
98 function skipToken (): Boolean; // returns `false` on eof
100 function expectId (): AnsiString;
101 procedure expectId (const aid
: AnsiString);
102 function eatId (const aid
: AnsiString): Boolean;
104 function expectStr (allowEmpty
: Boolean=false): AnsiString;
105 function expectInt (): Integer;
107 procedure expectTT (ttype
: Integer);
108 function eatTT (ttype
: Integer): Boolean;
110 function expectDelim (const ch
: AnsiChar): AnsiChar;
111 function eatDelim (const ch
: AnsiChar): Boolean;
114 property col
: Integer read mCol
;
115 property line
: Integer read mLine
;
117 property curChar
: AnsiChar read mCurChar
;
118 property nextChar
: AnsiChar read mNextChar
;
121 property tokCol
: Integer read mTokCol
;
122 property tokLine
: Integer read mTokLine
;
124 property tokType
: Integer read mTokType
; // see TTXXX constants
125 property tokStr
: AnsiString read mTokStr
; // string or identifier
126 property tokChar
: AnsiChar read mTokChar
; // for delimiters
127 property tokInt
: Integer read mTokInt
;
131 // ////////////////////////////////////////////////////////////////////////// //
133 TFileTextParser
= class(TTextParser
)
138 procedure warmup (); override; // called in constructor to warm up the system
139 procedure loadNextChar (); override; // loads next char into mNextChar; #0 means 'eof'
142 constructor Create (const fname
: AnsiString; loadToken
: Boolean=true);
143 destructor Destroy (); override;
146 TStrTextParser
= class(TTextParser
)
152 procedure warmup (); override; // called in constructor to warm up the system
153 procedure loadNextChar (); override; // loads next char into mNextChar; #0 means 'eof'
156 constructor Create (const astr
: AnsiString; loadToken
: Boolean=true);
157 destructor Destroy (); override;
161 // ////////////////////////////////////////////////////////////////////////// //
168 procedure putBuf (constref buf
; len
: SizeUInt
); virtual; abstract;
171 constructor Create ();
173 procedure put (const s
: AnsiString); overload
;
174 procedure put (v
: Byte); overload
;
175 procedure put (v
: Integer); overload
;
176 procedure put (const fmt
: AnsiString; args
: array of const); overload
;
177 procedure putIndent ();
179 procedure unindent ();
183 // ////////////////////////////////////////////////////////////////////////// //
185 TFileTextWriter
= class(TTextWriter
)
190 procedure putBuf (constref buf
; len
: SizeUInt
); override;
193 constructor Create (const fname
: AnsiString);
194 destructor Destroy (); override;
198 // ////////////////////////////////////////////////////////////////////////// //
199 function wcharTo1251 (wc
: WideChar): AnsiChar; inline;
200 function utfTo1251 (const s
: AnsiString): AnsiString;
202 function digitInBase (ch
: AnsiChar; base
: Integer): Integer;
212 wc2shitmap
: array[0..65535] of AnsiChar;
213 wc2shitmapInited
: Boolean = false;
216 // ////////////////////////////////////////////////////////////////////////// //
217 procedure initShitMap ();
219 cp1251
: array[0..127] of Word = (
220 $0402,$0403,$201A,$0453,$201E,$2026,$2020,$2021,$20AC,$2030,$0409,$2039,$040A,$040C,$040B,$040F,
221 $0452,$2018,$2019,$201C,$201D,$2022,$2013,$2014,$003F,$2122,$0459,$203A,$045A,$045C,$045B,$045F,
222 $00A0,$040E,$045E,$0408,$00A4,$0490,$00A6,$00A7,$0401,$00A9,$0404,$00AB,$00AC,$00AD,$00AE,$0407,
223 $00B0,$00B1,$0406,$0456,$0491,$00B5,$00B6,$00B7,$0451,$2116,$0454,$00BB,$0458,$0405,$0455,$0457,
224 $0410,$0411,$0412,$0413,$0414,$0415,$0416,$0417,$0418,$0419,$041A,$041B,$041C,$041D,$041E,$041F,
225 $0420,$0421,$0422,$0423,$0424,$0425,$0426,$0427,$0428,$0429,$042A,$042B,$042C,$042D,$042E,$042F,
226 $0430,$0431,$0432,$0433,$0434,$0435,$0436,$0437,$0438,$0439,$043A,$043B,$043C,$043D,$043E,$043F,
227 $0440,$0441,$0442,$0443,$0444,$0445,$0446,$0447,$0448,$0449,$044A,$044B,$044C,$044D,$044E,$044F
232 for f
:= 0 to High(wc2shitmap
) do wc2shitmap
[f
] := '?';
233 for f
:= 0 to 127 do wc2shitmap
[f
] := AnsiChar(f
);
234 for f
:= 0 to 127 do wc2shitmap
[cp1251
[f
]] := AnsiChar(f
+128);
235 wc2shitmapInited
:= true;
239 // ////////////////////////////////////////////////////////////////////////// //
240 // TODO: make a hash or something
241 function wcharTo1251 (wc
: WideChar): AnsiChar; inline;
243 if not wc2shitmapInited
then initShitMap();
244 if (LongWord(wc
) > 65535) then result
:= '?' else result
:= wc2shitmap
[LongWord(wc
)];
248 // ////////////////////////////////////////////////////////////////////////// //
249 // fast state-machine based UTF-8 decoder; using 8 bytes of memory
250 // code points from invalid range will never be valid, this is the property of the state machine
252 // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
253 utf8dfa
: array[0..$16c-1] of Byte = (
254 // maps bytes to character classes
255 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 00-0f
256 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 10-1f
257 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 20-2f
258 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 30-3f
259 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 40-4f
260 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 50-5f
261 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 60-6f
262 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 70-7f
263 $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01, // 80-8f
264 $09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09, // 90-9f
265 $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // a0-af
266 $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // b0-bf
267 $08,$08,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // c0-cf
268 $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // d0-df
269 $0a,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$04,$03,$03, // e0-ef
270 $0b,$06,$06,$06,$05,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08, // f0-ff
271 // maps a combination of a state of the automaton and a character class to a state
272 $00,$0c,$18,$24,$3c,$60,$54,$0c,$0c,$0c,$30,$48,$0c,$0c,$0c,$0c, // 100-10f
273 $0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$00,$0c,$0c,$0c,$0c,$0c,$00, // 110-11f
274 $0c,$00,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$18,$0c,$0c, // 120-12f
275 $0c,$0c,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c, // 130-13f
276 $0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$24, // 140-14f
277 $0c,$24,$0c,$0c,$0c,$24,$0c,$0c,$0c,$0c,$0c,$24,$0c,$24,$0c,$0c, // 150-15f
278 $0c,$24,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c);
281 // ////////////////////////////////////////////////////////////////////////// //
282 constructor TUtf8DecoderFast
.Create (v
: Boolean{fuck you, fpc}); begin state
:= Accept
; codepoint
:= 0; end;
284 procedure TUtf8DecoderFast
.reset (); inline; begin state
:= Accept
; codepoint
:= 0; end;
286 function TUtf8DecoderFast
.complete (): Boolean; inline; begin result
:= (state
= Accept
); end;
287 function TUtf8DecoderFast
.invalid (): Boolean; inline; begin result
:= (state
= Reject
); end;
288 function TUtf8DecoderFast
.completeOrInvalid (): Boolean; inline; begin result
:= (state
= Accept
) or (state
= Reject
); end;
290 function TUtf8DecoderFast
.decode (c
: AnsiChar): Boolean; inline; overload
; begin result
:= decode(Byte(c
)); end;
292 function TUtf8DecoderFast
.decode (b
: Byte): Boolean; inline; overload
;
296 if (state
= Reject
) then begin state
:= Accept
; codepoint
:= 0; end;
298 if (state
<> Accept
) then codepoint
:= (b
and $3f) or (codepoint
shl 6) else codepoint
:= ($ff shr tp
) and b
;
299 state
:= utf8dfa
[256+state
+tp
];
300 if (state
= Reject
) then begin codepoint
:= Replacement
; state
:= Accept
; end;
301 result
:= (state
= Accept
);
305 // ////////////////////////////////////////////////////////////////////////// //
306 function utfTo1251 (const s
: AnsiString): AnsiString;
309 ud
: TUtf8DecoderFast
;
311 for f
:= 1 to Length(s
) do
313 if (Byte(s
[f
]) > 127) then
315 ud
:= TUtf8DecoderFast
.Create(true);
317 for c
:= 1 to Length(s
) do
319 if ud
.decode(s
[c
]) then result
+= wcharTo1251(WideChar(ud
.codepoint
));
328 // ////////////////////////////////////////////////////////////////////////// //
329 function digitInBase (ch
: AnsiChar; base
: Integer): Integer;
332 if (base
< 1) or (base
> 36) then exit
;
333 if (ch
< '0') then exit
;
336 if (Integer(ch
) >= 48+base
) then exit
;
337 result
:= Integer(ch
)-48;
341 if (ch
>= '0') and (ch
<= '9') then begin result
:= Integer(ch
)-48; exit
; end;
342 if (ch
>= 'a') and (ch
<= 'z') then Dec(ch
, 32); // poor man's tolower()
343 if (ch
< 'A') or (Integer(ch
) >= 65+(base
-10)) then exit
;
344 result
:= Integer(ch
)-65+10;
349 // ////////////////////////////////////////////////////////////////////////// //
350 class function TTextParser
.quote (const s
: AnsiString): AnsiString;
352 function squote (const s
: AnsiString): AnsiString;
357 for f
:= 1 to Length(s
) do
359 if (s
[f
] = '''') then result
+= '''';
365 function dquote (const s
: AnsiString): AnsiString;
371 for f
:= 1 to Length(s
) do
374 if (ch
= #0) then result
+= '\z'
375 else if (ch
= #9) then result
+= '\t'
376 else if (ch
= #10) then result
+= '\n'
377 else if (ch
= #13) then result
+= '\r'
378 else if (ch
= #27) then result
+= '\e'
379 else if (ch
< ' ') or (ch
= #127) then
382 result
+= LowerCase(IntToHex(Integer(ch
), 2));
384 else if (ch
= '"') or (ch
= '\') then
398 needSingle
: Boolean = false;
401 for f
:= 1 to Length(s
) do
403 if (s
[f
] = '''') then begin needSingle
:= true; continue
; end;
404 if (s
[f
] < ' ') or (s
[f
] = #127) then begin result
:= dquote(s
); exit
; end;
406 if needSingle
then result
:= squote(s
) else result
:= ''''+s
+'''';
410 // ////////////////////////////////////////////////////////////////////////// //
411 constructor TTextParser
.Create (loadToken
: Boolean=true);
421 mAllowSignedNumbers
:= true;
422 warmup(); // change `mAllowSignedNumbers` there, if necessary
423 if loadToken
then skipToken();
427 destructor TTextParser
.Destroy ();
433 function TTextParser
.isEOF (): Boolean; inline; begin result
:= (mCurChar
= #0); end;
436 function TTextParser
.skipChar (): Boolean;
438 if (mCurChar
= #0) then begin result
:= false; exit
; end;
439 if (mCurChar
= #10) then begin mCol
:= 1; Inc(mLine
); end else Inc(mCol
);
440 mCurChar
:= mNextChar
;
441 if (mCurChar
= #0) then begin result
:= false; exit
; end;
444 if (mCurChar
= #13) then
446 if (mNextChar
= #10) then loadNextChar();
453 function TTextParser
.skipBlanks (): Boolean;
459 if (curChar
= '/') then
461 // single-line comment
462 if (nextChar
= '/') then
464 while not isEOF
and (curChar
<> #10) do skipChar();
465 skipChar(); // skip EOL
469 if (nextChar
= '*') then
471 // skip comment start
476 if (curChar
= '*') and (nextChar
= '/') then
487 // nesting multline comment
488 if (nextChar
= '+') then
490 // skip comment start
496 if (curChar
= '+') and (nextChar
= '/') then
502 if (level
= 0) then break
;
505 if (curChar
= '/') and (nextChar
= '+') then
507 // skip comment start
518 if (curChar
> ' ') then break
;
519 skipChar(); // skip blank
525 function TTextParser
.skipToken (): Boolean;
527 procedure parseInt ();
529 neg
: Boolean = false;
533 if mAllowSignedNumbers
then
535 if (curChar
= '+') or (curChar
= '-') then
537 neg
:= (curChar
= '-');
539 if (curChar
< '0') or (curChar
> '9') then
542 if (neg
) then mTokChar
:= '-' else mTokChar
:= '+';
547 if (curChar
= '0') then
563 if (base
< 0) then base
:= 10;
564 if (digitInBase(curChar
, base
) < 0) then raise Exception
.Create('invalid number');
566 mTokInt
:= 0; // just in case
569 n
:= digitInBase(curChar
, base
);
570 if (n
< 0) then break
;
572 if (n
< 0) or (n
< mTokInt
) then raise Exception
.Create('integer overflow');
576 // check for valid number end
579 if (curChar
= '.') then raise Exception
.Create('floating numbers aren''t supported yet');
580 if (curChar
= '_') or ((curChar
>= 'A') and (curChar
<= 'Z')) or ((curChar
>= 'a') and (curChar
<= 'z')) or (curChar
>= #128) then
582 raise Exception
.Create('invalid number');
585 if neg
then mTokInt
:= -mTokInt
;
588 procedure parseString ();
594 mTokStr
:= ''; // just in case
596 skipChar(); // skip starting quote
600 if (qch
= '"') and (curChar
= '\') then
602 if (nextChar
= #0) then raise Exception
.Create('unterminated string escape');
604 // skip backslash and escape type
613 'x', 'X': // hex escape
615 n
:= digitInBase(curChar
, 16);
616 if (n
< 0) then raise Exception
.Create('invalid hexstr escape');
618 if (digitInBase(curChar
, 16) > 0) then
620 n
:= n
*16+digitInBase(curChar
, 16);
623 mTokStr
+= AnsiChar(n
);
629 // duplicate single quote (pascal style)
630 if (qch
= '''') and (curChar
= '''') and (nextChar
= '''') then
638 if (curChar
= qch
) then
640 skipChar(); // skip ending quote
648 procedure parseId ();
651 mTokStr
:= ''; // just in case
652 while (curChar
= '_') or ((curChar
>= '0') and (curChar
<= '9')) or
653 ((curChar
>= 'A') and (curChar
<= 'Z')) or
654 ((curChar
>= 'a') and (curChar
<= 'z')) or
668 if not skipBlanks() then
682 if mAllowSignedNumbers
and ((curChar
= '+') or (curChar
= '-')) then begin parseInt(); exit
; end;
683 if (curChar
>= '0') and (curChar
<= '9') then begin parseInt(); exit
; end;
686 if (curChar
= '"') or (curChar
= '''') then begin parseString(); exit
; end;
689 if (curChar
= '_') or ((curChar
>= 'A') and (curChar
<= 'Z')) or ((curChar
>= 'a') and (curChar
<= 'z')) or (curChar
>= #128) then begin parseId(); exit
; end;
693 ',': mTokType
:= TTComma
;
694 ':': mTokType
:= TTColon
;
695 ';': mTokType
:= TTSemi
;
696 '{': mTokType
:= TTBegin
;
697 '}': mTokType
:= TTEnd
;
698 else mTokType
:= TTDelim
;
705 function TTextParser
.expectId (): AnsiString;
707 if (mTokType
<> TTId
) then raise Exception
.Create('identifier expected');
713 procedure TTextParser
.expectId (const aid
: AnsiString);
715 if (mTokType
<> TTId
) or (CompareText(mTokStr
, aid
) <> 0) then raise Exception
.Create('identifier '''+aid
+''' expected');
720 function TTextParser
.eatId (const aid
: AnsiString): Boolean;
723 if (mTokType
<> TTId
) or (CompareText(mTokStr
, aid
) <> 0) then exit
;
729 function TTextParser
.expectStr (allowEmpty
: Boolean=false): AnsiString;
731 if (mTokType
<> TTStr
) then raise Exception
.Create('string expected');
732 if (not allowEmpty
) and (Length(mTokStr
) = 0) then raise Exception
.Create('non-empty string expected');
738 function TTextParser
.expectInt (): Integer;
740 if (mTokType
<> TTInt
) then raise Exception
.Create('string expected');
746 procedure TTextParser
.expectTT (ttype
: Integer);
748 if (mTokType
<> ttype
) then raise Exception
.Create('unexpected token');
753 function TTextParser
.eatTT (ttype
: Integer): Boolean;
755 result
:= (mTokType
= ttype
);
756 if result
then skipToken();
760 function TTextParser
.expectDelim (const ch
: AnsiChar): AnsiChar;
762 if (mTokType
<> TTDelim
) then raise Exception
.Create(Format('delimiter ''%s'' expected', [ch
]));
768 function TTextParser
.eatDelim (const ch
: AnsiChar): Boolean;
771 if (mTokType
<> TTDelim
) or (mTokChar
<> ch
) then exit
;
777 // ////////////////////////////////////////////////////////////////////////// //
778 constructor TFileTextParser
.Create (const fname
: AnsiString; loadToken
: Boolean=true);
780 AssignFile(mFile
, fname
);
782 inherited Create(loadToken
);
786 destructor TFileTextParser
.Destroy ();
793 procedure TFileTextParser
.warmup ();
797 blockRead(mFile
, mCurChar
, 1, rd
);
798 if (rd
= 0) then begin mCurChar
:= #0; exit
; end;
799 if (mCurChar
= #0) then mCurChar
:= ' ';
804 procedure TFileTextParser
.loadNextChar ();
808 blockRead(mFile
, mNextChar
, 1, rd
);
809 if (rd
= 0) then begin mNextChar
:= #0; exit
; end;
810 if (mNextChar
= #0) then mNextChar
:= ' ';
814 // ////////////////////////////////////////////////////////////////////////// //
815 constructor TStrTextParser
.Create (const astr
: AnsiString; loadToken
: Boolean=true);
819 inherited Create(loadToken
);
823 destructor TStrTextParser
.Destroy ();
830 procedure TStrTextParser
.warmup ();
832 if (mPos
> Length(mStr
)) then
838 mCurChar
:= mStr
[mPos
]; Inc(mPos
);
839 if (mCurChar
= #0) then mCurChar
:= ' ';
844 procedure TStrTextParser
.loadNextChar ();
847 if (mPos
> Length(mStr
)) then exit
;
848 mNextChar
:= mStr
[mPos
]; Inc(mPos
);
849 if (mNextChar
= #0) then mNextChar
:= ' ';
853 // ////////////////////////////////////////////////////////////////////////// //
854 constructor TTextWriter
.Create (); begin mIndent
:= 0; end;
855 procedure TTextWriter
.put (const s
: AnsiString); overload
; begin if (Length(s
) > 0) then putBuf((@(s
[1]))^, Length(s
)); end;
856 procedure TTextWriter
.put (v
: Byte); overload
; begin put('%d', [v
]); end;
857 procedure TTextWriter
.put (v
: Integer); overload
; begin put('%d', [v
]); end;
858 procedure TTextWriter
.put (const fmt
: AnsiString; args
: array of const); overload
; begin put(formatstrf(fmt
, args
)); end;
859 procedure TTextWriter
.putIndent (); var f
: Integer; begin for f
:= 1 to mIndent
do put(' '); end;
860 procedure TTextWriter
.indent (); begin Inc(mIndent
, 2); end;
861 procedure TTextWriter
.unindent (); begin Dec(mIndent
, 2); end;
864 // ////////////////////////////////////////////////////////////////////////// //
865 constructor TFileTextWriter
.Create (const fname
: AnsiString);
867 AssignFile(mFile
, fname
);
873 destructor TFileTextWriter
.Destroy ();
879 procedure TFileTextWriter
.putBuf (constref buf
; len
: SizeUInt
);
887 BlockWrite(mFile
, pc
^, len
, wr
);
888 if (wr
<> len
) then raise Exception
.Create('write error');