1 (* Copyright (C) DooM 2D:Forever Developers
3 * This program is free software: you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation, either version 3 of the License, or
6 * (at your option) any later version.
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 {$INCLUDE a_modes.inc}
25 // ////////////////////////////////////////////////////////////////////////// //
27 TUtf8DecoderFast
= packed record
29 const Replacement
= $FFFD; // replacement char for invalid unicode
37 codepoint
: LongWord; // decoded codepoint (valid only when decoder is in "complete" state)
40 constructor Create (v
: Boolean{fuck you, fpc});
42 procedure reset (); inline;
44 function complete (): Boolean; inline; // is current character complete? take `codepoint` then
45 function invalid (): Boolean; inline;
46 function completeOrInvalid (): Boolean; inline;
48 // process one byte, return `true` if codepoint is ready
49 function decode (b
: Byte): Boolean; inline; overload
;
50 function decode (c
: AnsiChar): Boolean; inline; overload
;
54 // ////////////////////////////////////////////////////////////////////////// //
63 //TTFloat = 3; // not yet
68 TTBegin
= 8; // left curly
69 TTEnd
= 9; // right curly
70 TTDelim
= 10; // other delimiters
74 mCurChar
, mNextChar
: AnsiChar;
76 mAllowSignedNumbers
: Boolean; // internal control
78 mTokLine
, mTokCol
: Integer; // token start
80 mTokStr
: AnsiString; // string or identifier
81 mTokChar
: AnsiChar; // for delimiters
85 procedure warmup (); virtual; // called in constructor to warm up the system
86 procedure loadNextChar (); virtual; abstract; // loads next char into mNextChar; #0 means 'eof'
89 class function quote (const s
: AnsiString): AnsiString;
92 constructor Create (loadToken
: Boolean=true);
93 destructor Destroy (); override;
95 function isEOF (): Boolean; inline;
97 function skipChar (): Boolean; // returns `false` on eof
99 function skipBlanks (): Boolean; // ...and comments; returns `false` on eof
101 function skipToken (): Boolean; // returns `false` on eof
103 function expectId (): AnsiString;
104 procedure expectId (const aid
: AnsiString);
105 function eatId (const aid
: AnsiString): Boolean;
107 function expectStr (allowEmpty
: Boolean=false): AnsiString;
108 function expectInt (): Integer;
110 procedure expectTT (ttype
: Integer);
111 function eatTT (ttype
: Integer): Boolean;
113 function expectDelim (const ch
: AnsiChar): AnsiChar;
114 function eatDelim (const ch
: AnsiChar): Boolean;
117 property col
: Integer read mCol
;
118 property line
: Integer read mLine
;
120 property curChar
: AnsiChar read mCurChar
;
121 property nextChar
: AnsiChar read mNextChar
;
124 property tokCol
: Integer read mTokCol
;
125 property tokLine
: Integer read mTokLine
;
127 property tokType
: Integer read mTokType
; // see TTXXX constants
128 property tokStr
: AnsiString read mTokStr
; // string or identifier
129 property tokChar
: AnsiChar read mTokChar
; // for delimiters
130 property tokInt
: Integer read mTokInt
;
134 // ////////////////////////////////////////////////////////////////////////// //
136 TFileTextParser
= class(TTextParser
)
141 procedure loadNextChar (); override; // loads next char into mNextChar; #0 means 'eof'
144 constructor Create (const fname
: AnsiString; loadToken
: Boolean=true);
145 destructor Destroy (); override;
148 TStrTextParser
= class(TTextParser
)
154 procedure loadNextChar (); override; // loads next char into mNextChar; #0 means 'eof'
157 constructor Create (const astr
: AnsiString; loadToken
: Boolean=true);
158 destructor Destroy (); override;
162 // ////////////////////////////////////////////////////////////////////////// //
169 procedure putBuf (constref buf
; len
: SizeUInt
); virtual; abstract;
172 constructor Create ();
174 procedure put (const s
: AnsiString); overload
;
175 procedure put (v
: Byte); overload
;
176 procedure put (v
: Integer); overload
;
177 procedure put (const fmt
: AnsiString; args
: array of const); overload
;
178 procedure putIndent ();
180 procedure unindent ();
184 // ////////////////////////////////////////////////////////////////////////// //
186 TFileTextWriter
= class(TTextWriter
)
191 procedure putBuf (constref buf
; len
: SizeUInt
); override;
194 constructor Create (const fname
: AnsiString);
195 destructor Destroy (); override;
199 // ////////////////////////////////////////////////////////////////////////// //
200 function wcharTo1251 (wc
: WideChar): AnsiChar; inline;
201 function utfTo1251 (const s
: AnsiString): AnsiString;
203 function digitInBase (ch
: AnsiChar; base
: Integer): Integer;
213 wc2shitmap
: array[0..65535] of AnsiChar;
214 wc2shitmapInited
: Boolean = false;
217 // ////////////////////////////////////////////////////////////////////////// //
218 procedure initShitMap ();
220 cp1251
: array[0..127] of Word = (
221 $0402,$0403,$201A,$0453,$201E,$2026,$2020,$2021,$20AC,$2030,$0409,$2039,$040A,$040C,$040B,$040F,
222 $0452,$2018,$2019,$201C,$201D,$2022,$2013,$2014,$003F,$2122,$0459,$203A,$045A,$045C,$045B,$045F,
223 $00A0,$040E,$045E,$0408,$00A4,$0490,$00A6,$00A7,$0401,$00A9,$0404,$00AB,$00AC,$00AD,$00AE,$0407,
224 $00B0,$00B1,$0406,$0456,$0491,$00B5,$00B6,$00B7,$0451,$2116,$0454,$00BB,$0458,$0405,$0455,$0457,
225 $0410,$0411,$0412,$0413,$0414,$0415,$0416,$0417,$0418,$0419,$041A,$041B,$041C,$041D,$041E,$041F,
226 $0420,$0421,$0422,$0423,$0424,$0425,$0426,$0427,$0428,$0429,$042A,$042B,$042C,$042D,$042E,$042F,
227 $0430,$0431,$0432,$0433,$0434,$0435,$0436,$0437,$0438,$0439,$043A,$043B,$043C,$043D,$043E,$043F,
228 $0440,$0441,$0442,$0443,$0444,$0445,$0446,$0447,$0448,$0449,$044A,$044B,$044C,$044D,$044E,$044F
233 for f
:= 0 to High(wc2shitmap
) do wc2shitmap
[f
] := '?';
234 for f
:= 0 to 127 do wc2shitmap
[f
] := AnsiChar(f
);
235 for f
:= 0 to 127 do wc2shitmap
[cp1251
[f
]] := AnsiChar(f
+128);
236 wc2shitmapInited
:= true;
240 // ////////////////////////////////////////////////////////////////////////// //
241 // TODO: make a hash or something
242 function wcharTo1251 (wc
: WideChar): AnsiChar; inline;
244 if not wc2shitmapInited
then initShitMap();
245 if (LongWord(wc
) > 65535) then result
:= '?' else result
:= wc2shitmap
[LongWord(wc
)];
249 // ////////////////////////////////////////////////////////////////////////// //
250 // fast state-machine based UTF-8 decoder; using 8 bytes of memory
251 // code points from invalid range will never be valid, this is the property of the state machine
253 // see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
254 utf8dfa
: array[0..$16c-1] of Byte = (
255 // maps bytes to character classes
256 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 00-0f
257 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 10-1f
258 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 20-2f
259 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 30-3f
260 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 40-4f
261 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 50-5f
262 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 60-6f
263 $00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00,$00, // 70-7f
264 $01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01,$01, // 80-8f
265 $09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09,$09, // 90-9f
266 $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // a0-af
267 $07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07,$07, // b0-bf
268 $08,$08,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // c0-cf
269 $02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02,$02, // d0-df
270 $0a,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$03,$04,$03,$03, // e0-ef
271 $0b,$06,$06,$06,$05,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08,$08, // f0-ff
272 // maps a combination of a state of the automaton and a character class to a state
273 $00,$0c,$18,$24,$3c,$60,$54,$0c,$0c,$0c,$30,$48,$0c,$0c,$0c,$0c, // 100-10f
274 $0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$00,$0c,$0c,$0c,$0c,$0c,$00, // 110-11f
275 $0c,$00,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$18,$0c,$0c, // 120-12f
276 $0c,$0c,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$18,$0c,$0c, // 130-13f
277 $0c,$0c,$0c,$0c,$0c,$18,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$24, // 140-14f
278 $0c,$24,$0c,$0c,$0c,$24,$0c,$0c,$0c,$0c,$0c,$24,$0c,$24,$0c,$0c, // 150-15f
279 $0c,$24,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c,$0c);
282 // ////////////////////////////////////////////////////////////////////////// //
283 constructor TUtf8DecoderFast
.Create (v
: Boolean{fuck you, fpc}); begin state
:= Accept
; codepoint
:= 0; end;
285 procedure TUtf8DecoderFast
.reset (); inline; begin state
:= Accept
; codepoint
:= 0; end;
287 function TUtf8DecoderFast
.complete (): Boolean; inline; begin result
:= (state
= Accept
); end;
288 function TUtf8DecoderFast
.invalid (): Boolean; inline; begin result
:= (state
= Reject
); end;
289 function TUtf8DecoderFast
.completeOrInvalid (): Boolean; inline; begin result
:= (state
= Accept
) or (state
= Reject
); end;
291 function TUtf8DecoderFast
.decode (c
: AnsiChar): Boolean; inline; overload
; begin result
:= decode(Byte(c
)); end;
293 function TUtf8DecoderFast
.decode (b
: Byte): Boolean; inline; overload
;
297 if (state
= Reject
) then begin state
:= Accept
; codepoint
:= 0; end;
299 if (state
<> Accept
) then codepoint
:= (b
and $3f) or (codepoint
shl 6) else codepoint
:= ($ff shr tp
) and b
;
300 state
:= utf8dfa
[256+state
+tp
];
301 if (state
= Reject
) then begin codepoint
:= Replacement
; state
:= Accept
; end;
302 result
:= (state
= Accept
);
306 // ////////////////////////////////////////////////////////////////////////// //
307 function utfTo1251 (const s
: AnsiString): AnsiString;
310 ud
: TUtf8DecoderFast
;
312 for f
:= 1 to Length(s
) do
314 if (Byte(s
[f
]) > 127) then
316 ud
:= TUtf8DecoderFast
.Create(true);
318 for c
:= 1 to Length(s
) do
320 if ud
.decode(s
[c
]) then result
+= wcharTo1251(WideChar(ud
.codepoint
));
329 // ////////////////////////////////////////////////////////////////////////// //
330 function digitInBase (ch
: AnsiChar; base
: Integer): Integer;
333 if (base
< 1) or (base
> 36) then exit
;
334 if (ch
< '0') then exit
;
337 if (Integer(ch
) >= 48+base
) then exit
;
338 result
:= Integer(ch
)-48;
342 if (ch
>= '0') and (ch
<= '9') then begin result
:= Integer(ch
)-48; exit
; end;
343 if (ch
>= 'a') and (ch
<= 'z') then Dec(ch
, 32); // poor man's tolower()
344 if (ch
< 'A') or (Integer(ch
) >= 65+(base
-10)) then exit
;
345 result
:= Integer(ch
)-65+10;
350 // ////////////////////////////////////////////////////////////////////////// //
351 class function TTextParser
.quote (const s
: AnsiString): AnsiString;
353 function squote (const s
: AnsiString): AnsiString;
358 for f
:= 1 to Length(s
) do
360 if (s
[f
] = '''') then result
+= '''';
366 function dquote (const s
: AnsiString): AnsiString;
372 for f
:= 1 to Length(s
) do
375 if (ch
= #0) then result
+= '\z'
376 else if (ch
= #9) then result
+= '\t'
377 else if (ch
= #10) then result
+= '\n'
378 else if (ch
= #13) then result
+= '\r'
379 else if (ch
= #27) then result
+= '\e'
380 else if (ch
< ' ') or (ch
= #127) then
383 result
+= LowerCase(IntToHex(Integer(ch
), 2));
385 else if (ch
= '"') or (ch
= '\') then
399 needSingle
: Boolean = false;
402 for f
:= 1 to Length(s
) do
404 if (s
[f
] = '''') then begin needSingle
:= true; continue
; end;
405 if (s
[f
] < ' ') or (s
[f
] = #127) then begin result
:= dquote(s
); exit
; end;
407 if needSingle
then result
:= squote(s
) else result
:= ''''+s
+'''';
411 // ////////////////////////////////////////////////////////////////////////// //
412 constructor TTextParser
.Create (loadToken
: Boolean=true);
422 mAllowSignedNumbers
:= true;
423 warmup(); // change `mAllowSignedNumbers` there, if necessary
424 if loadToken
then skipToken();
428 destructor TTextParser
.Destroy ();
434 function TTextParser
.isEOF (): Boolean; inline; begin result
:= (mCurChar
= #0); end;
437 procedure TTextParser
.warmup ();
441 mCurChar
:= mNextChar
;
442 if (mNextChar
<> #0) then loadNextChar();
446 function TTextParser
.skipChar (): Boolean;
448 if (mCurChar
= #0) then begin result
:= false; exit
; end;
449 if (mCurChar
= #10) then begin mCol
:= 1; Inc(mLine
); end else Inc(mCol
);
450 mCurChar
:= mNextChar
;
451 if (mCurChar
= #0) then begin result
:= false; exit
; end;
454 if (mCurChar
= #13) then
456 if (mNextChar
= #10) then loadNextChar();
463 function TTextParser
.skipBlanks (): Boolean;
469 if (curChar
= '/') then
471 // single-line comment
472 if (nextChar
= '/') then
474 while not isEOF
and (curChar
<> #10) do skipChar();
475 skipChar(); // skip EOL
479 if (nextChar
= '*') then
481 // skip comment start
486 if (curChar
= '*') and (nextChar
= '/') then
497 // nesting multline comment
498 if (nextChar
= '+') then
500 // skip comment start
506 if (curChar
= '+') and (nextChar
= '/') then
512 if (level
= 0) then break
;
515 if (curChar
= '/') and (nextChar
= '+') then
517 // skip comment start
528 if (curChar
> ' ') then break
;
529 skipChar(); // skip blank
535 function TTextParser
.skipToken (): Boolean;
537 procedure parseInt ();
539 neg
: Boolean = false;
543 if mAllowSignedNumbers
then
545 if (curChar
= '+') or (curChar
= '-') then
547 neg
:= (curChar
= '-');
549 if (curChar
< '0') or (curChar
> '9') then
552 if (neg
) then mTokChar
:= '-' else mTokChar
:= '+';
557 if (curChar
= '0') then
573 if (base
< 0) then base
:= 10;
574 if (digitInBase(curChar
, base
) < 0) then raise Exception
.Create('invalid number');
576 mTokInt
:= 0; // just in case
579 n
:= digitInBase(curChar
, base
);
580 if (n
< 0) then break
;
582 if (n
< 0) or (n
< mTokInt
) then raise Exception
.Create('integer overflow');
586 // check for valid number end
589 if (curChar
= '.') then raise Exception
.Create('floating numbers aren''t supported yet');
590 if (curChar
= '_') or ((curChar
>= 'A') and (curChar
<= 'Z')) or ((curChar
>= 'a') and (curChar
<= 'z')) or (curChar
>= #128) then
592 raise Exception
.Create('invalid number');
595 if neg
then mTokInt
:= -mTokInt
;
598 procedure parseString ();
604 mTokStr
:= ''; // just in case
606 skipChar(); // skip starting quote
610 if (qch
= '"') and (curChar
= '\') then
612 if (nextChar
= #0) then raise Exception
.Create('unterminated string escape');
614 // skip backslash and escape type
623 'x', 'X': // hex escape
625 n
:= digitInBase(curChar
, 16);
626 if (n
< 0) then raise Exception
.Create('invalid hexstr escape');
628 if (digitInBase(curChar
, 16) > 0) then
630 n
:= n
*16+digitInBase(curChar
, 16);
633 mTokStr
+= AnsiChar(n
);
639 // duplicate single quote (pascal style)
640 if (qch
= '''') and (curChar
= '''') and (nextChar
= '''') then
648 if (curChar
= qch
) then
650 skipChar(); // skip ending quote
658 procedure parseId ();
661 mTokStr
:= ''; // just in case
662 while (curChar
= '_') or ((curChar
>= '0') and (curChar
<= '9')) or
663 ((curChar
>= 'A') and (curChar
<= 'Z')) or
664 ((curChar
>= 'a') and (curChar
<= 'z')) or
678 if not skipBlanks() then
692 if mAllowSignedNumbers
and ((curChar
= '+') or (curChar
= '-')) then begin parseInt(); exit
; end;
693 if (curChar
>= '0') and (curChar
<= '9') then begin parseInt(); exit
; end;
696 if (curChar
= '"') or (curChar
= '''') then begin parseString(); exit
; end;
699 if (curChar
= '_') or ((curChar
>= 'A') and (curChar
<= 'Z')) or ((curChar
>= 'a') and (curChar
<= 'z')) or (curChar
>= #128) then begin parseId(); exit
; end;
703 ',': mTokType
:= TTComma
;
704 ':': mTokType
:= TTColon
;
705 ';': mTokType
:= TTSemi
;
706 '{': mTokType
:= TTBegin
;
707 '}': mTokType
:= TTEnd
;
708 else mTokType
:= TTDelim
;
715 function TTextParser
.expectId (): AnsiString;
717 if (mTokType
<> TTId
) then raise Exception
.Create('identifier expected');
723 procedure TTextParser
.expectId (const aid
: AnsiString);
725 if (mTokType
<> TTId
) or (CompareText(mTokStr
, aid
) <> 0) then raise Exception
.Create('identifier '''+aid
+''' expected');
730 function TTextParser
.eatId (const aid
: AnsiString): Boolean;
733 if (mTokType
<> TTId
) or (CompareText(mTokStr
, aid
) <> 0) then exit
;
739 function TTextParser
.expectStr (allowEmpty
: Boolean=false): AnsiString;
741 if (mTokType
<> TTStr
) then raise Exception
.Create('string expected');
742 if (not allowEmpty
) and (Length(mTokStr
) = 0) then raise Exception
.Create('non-empty string expected');
748 function TTextParser
.expectInt (): Integer;
750 if (mTokType
<> TTInt
) then raise Exception
.Create('string expected');
756 procedure TTextParser
.expectTT (ttype
: Integer);
758 if (mTokType
<> ttype
) then raise Exception
.Create('unexpected token');
763 function TTextParser
.eatTT (ttype
: Integer): Boolean;
765 result
:= (mTokType
= ttype
);
766 if result
then skipToken();
770 function TTextParser
.expectDelim (const ch
: AnsiChar): AnsiChar;
772 if (mTokType
<> TTDelim
) then raise Exception
.Create(Format('delimiter ''%s'' expected', [ch
]));
778 function TTextParser
.eatDelim (const ch
: AnsiChar): Boolean;
781 if (mTokType
<> TTDelim
) or (mTokChar
<> ch
) then exit
;
787 // ////////////////////////////////////////////////////////////////////////// //
788 constructor TFileTextParser
.Create (const fname
: AnsiString; loadToken
: Boolean=true);
790 mFile
:= openDiskFileRO(fname
);
791 inherited Create(loadToken
);
795 destructor TFileTextParser
.Destroy ();
802 procedure TFileTextParser
.loadNextChar ();
806 rd
:= mFile
.Read(mNextChar
, 1);
807 if (rd
= 0) then begin mNextChar
:= #0; exit
; end;
808 if (mNextChar
= #0) then mNextChar
:= ' ';
812 // ////////////////////////////////////////////////////////////////////////// //
813 constructor TStrTextParser
.Create (const astr
: AnsiString; loadToken
: Boolean=true);
817 inherited Create(loadToken
);
821 destructor TStrTextParser
.Destroy ();
828 procedure TStrTextParser
.loadNextChar ();
831 if (mPos
> Length(mStr
)) then exit
;
832 mNextChar
:= mStr
[mPos
]; Inc(mPos
);
833 if (mNextChar
= #0) then mNextChar
:= ' ';
837 // ////////////////////////////////////////////////////////////////////////// //
838 constructor TTextWriter
.Create (); begin mIndent
:= 0; end;
839 procedure TTextWriter
.put (const s
: AnsiString); overload
; begin if (Length(s
) > 0) then putBuf((@(s
[1]))^, Length(s
)); end;
840 procedure TTextWriter
.put (v
: Byte); overload
; begin put('%d', [v
]); end;
841 procedure TTextWriter
.put (v
: Integer); overload
; begin put('%d', [v
]); end;
842 procedure TTextWriter
.put (const fmt
: AnsiString; args
: array of const); overload
; begin put(formatstrf(fmt
, args
)); end;
843 procedure TTextWriter
.putIndent (); var f
: Integer; begin for f
:= 1 to mIndent
do put(' '); end;
844 procedure TTextWriter
.indent (); begin Inc(mIndent
, 2); end;
845 procedure TTextWriter
.unindent (); begin Dec(mIndent
, 2); end;
848 // ////////////////////////////////////////////////////////////////////////// //
849 constructor TFileTextWriter
.Create (const fname
: AnsiString);
851 mFile
:= createDiskFile(fname
);
856 destructor TFileTextWriter
.Destroy ();
863 procedure TFileTextWriter
.putBuf (constref buf
; len
: SizeUInt
);
870 mFile
.WriteBuffer(pc
^, len
);