DEADSOFTWARE

added mapio generator (written in D)
[d2df-sdl.git] / src / tools / mapiogen / lexer.d
1 /* coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
2 * Understanding is not required. Only obedience.
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17 module lexer;
// Refuse to build with a compiler older than the oldest front-end this code was written for.
static assert(__VERSION__ >= 2071, "you need at least DMD 2.071 to compile this code");


// ////////////////////////////////////////////////////////////////////////// //
// Short local name for `size_t`; declared only when the `is(typeof(usize))` probe fails.
static if (!is(typeof(usize))) private alias usize = size_t;
25 // ////////////////////////////////////////////////////////////////////////// //
/// Source position: file name, line, column and an offset (`tpos`).
public struct Loc {
  string file;
  int line, col;
  uint tpos;

  /// Renders the position as `file (line,col)`.
  string toString () const {
    import std.string : format;
    return format("%s (%s,%s)", file, line, col);
  }

  /// Renders the position as `(line,col)`, without the file name.
  string toStringNoFile () const {
    import std.string : format;
    return format("(%s,%s)", line, col);
  }

  /// A location counts as valid only when both line and column are positive.
  @property bool valid () const pure nothrow @safe @nogc { pragma(inline, true); return !(line <= 0 || col <= 0); }
}
38 // ////////////////////////////////////////////////////////////////////////// //
/// Exception that additionally records the source location the problem refers to.
public class ErrorAt : Exception {
  Loc loc;

  /// Creates an error without location information (`loc` keeps its default value).
  this (string msg, Throwable next=null, string file=__FILE__, usize line=__LINE__) pure nothrow @safe @nogc {
    super(msg, file, line, next);
  }

  /// Creates an error that refers to the source location `aloc`.
  this (in Loc aloc, string msg, Throwable next=null, string file=__FILE__, usize line=__LINE__) pure nothrow @safe @nogc {
    super(msg, file, line, next);
    loc = aloc;
  }
}
47 // ////////////////////////////////////////////////////////////////////////// //
/// One lexeme: its kind, its text, its numeric value (for numbers) and where it starts and ends.
public struct Token {
public:
  /// Token kinds; a default-constructed token is `EOF`.
  enum Type {
    EOF = -1,
    Id,
    Str,
    Num,
    Spec,
  }

private:
  const(char)[] tkstr; // token text

public:
  Loc loc, eloc; // token start, token end (after last char)
  Type type = Type.EOF; // token type
  long num; // should be enough for everyone

@safe:
  /// Throws `ErrorAt` (located at the token start) unless the token has kind `tp`.
  void mustbeType (Token.Type tp, string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    if (!(type == tp)) throw new ErrorAt(loc, msg, null, file, line);
  }

  /// Kind-specific shortcuts for `mustbeType()`.
  void mustbeId (string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Id, msg, file, line);
  }

  /// ditto
  void mustbeStr (string msg="string expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Str, msg, file, line);
  }

  /// ditto
  void mustbeNum (string msg="number expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Num, msg, file, line);
  }

  /// ditto
  void mustbeSpec (string msg="punctuation expected", string file=__FILE__, usize line=__LINE__) {
    pragma(inline, true);
    mustbeType(Type.Spec, msg, file, line);
  }

  /// Debug rendering: `(line,col): Kind:payload`.
  string toString () const @trusted {
    import std.string : format;
    // every variant starts with the same position prefix
    immutable string where = "(%s,%d): ".format(loc.line, loc.col);
    final switch (type) with (Type) {
      case EOF: return where~"<EOF>";
      case Id: return where~"Id:%s".format(tkstr);
      case Str: return where~"Str:%s".format(Lexer.quote(tkstr));
      case Num: return where~"Num:%s".format(num);
      case Spec: return where~"Spec:<%s>".format(tkstr);
    }
    assert(0);
  }

nothrow:
  // get immutable string
  // this converts id to `string` via `.idup`, use with caution!
  // `.idup` is used to not anchor the whole source string
  @property string istr () const { pragma(inline, true); return (tkstr.length == 0 ? null : tkstr.idup); }

const pure nothrow @nogc @property:
  /// Token text, without copying.
  const(char)[] str () { pragma(inline, true); return tkstr; }

  // kind predicates
  bool isId () { pragma(inline, true); return (Type.Id == type); }
  bool isStr () { pragma(inline, true); return (Type.Str == type); }
  bool isNum () { pragma(inline, true); return (Type.Num == type); }
  bool isSpec () { pragma(inline, true); return (Type.Spec == type); }
  bool isEOF () { pragma(inline, true); return (Type.EOF == type); }
}
104 // ////////////////////////////////////////////////////////////////////////// //
105 public final class Lexer {
106 private:
107 const(char)[] text;
108 uint tpos;
109 Loc cpos; // position for last `getChar()`
110 Loc pend; // end of previous token, for better error messages
111 bool eof;
112 bool lastWasEOL = true;
113 Token[] lookup;
114 Token tokeof; // will be fixed by `nextToken()`
116 public:
/// Creates a lexer over `atext`; `afname` (optional) is the file name reported in locations.
/// The first token is read immediately.
this(T) (const(char)[] atext, T afname=null) if (is(T : const(char)[])) {
  text = atext;
  if (afname.length > 0) {
    // an immutable name can be stored as is, anything else has to be copied
    static if (is(T == string)) {
      cpos.file = afname;
    } else {
      cpos.file = afname.idup;
    }
  }
  tokeof.loc.file = cpos.file;
  // prime the lookahead buffer
  nextToken();
  // there is no previous token yet
  pend.tpos = 0;
  pend.col = 1;
  pend.line = 1;
}
/// Throws `ErrorAt` located at the current token (or at the EOF token when the lookahead is empty).
void error (string msg, string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  // the `loc` property already falls back to the EOF token when nothing is buffered
  throw new ErrorAt(loc, msg, null, file, line);
}

/// Throws `ErrorAt` located at the start of token `tk`.
static private void error (in ref Token tk, string msg, string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  throw new ErrorAt(tk.loc, msg, null, file, line);
}

/// Throws `ErrorAt` located at `loc`.
static private void error() (in auto ref Loc loc, string msg, string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  throw new ErrorAt(loc, msg, null, file, line);
}
/// Returns the text of source line number `idx` (counting from 1; 0 is treated as 1)
/// with trailing blanks removed, or `null` when the line starts at or past the end of the text.
const(char)[] line (uint idx) {
  if (idx == 0) idx = 1;
  // skip `idx-1` complete lines
  uint start = 0;
  foreach (immutable _; 1..idx) {
    while (start < text.length && text.ptr[start] != '\n') ++start;
    ++start; // step over the newline itself
  }
  if (start >= text.length) return null;
  // find the end of the requested line
  uint stop = start;
  while (stop < text.length && text.ptr[stop] != '\n') ++stop;
  // drop trailing spaces and control chars
  while (stop > start && text.ptr[stop-1] <= ' ') --stop;
  return text[start..stop];
}
/// Drops the current token (if there is one) and reads one more token into the lookahead buffer.
void popFront () {
  if (lookup.length != 0) {
    // remember where the dropped token ended
    pend = lookup.ptr[0].eloc;
    pend.col += 1; // for better error messages
    pend.tpos += 1; // to be consistent
    // shift the remaining lookahead one slot down
    for (usize i = 1; i < lookup.length; ++i) lookup.ptr[i-1] = lookup.ptr[i];
    lookup.length = lookup.length-1;
    lookup.assumeSafeAppend;
  }
  nextToken();
}
// Input-range style accessors for the current token.
@property pure nothrow @safe @nogc {
  /// `true` when no token is buffered.
  bool empty () const { pragma(inline, true); return (lookup.length == 0); }

  /// Current token, or the EOF token when nothing is buffered.
  /// Uses a checked index instead of `.ptr`: taking `.ptr` of a slice is not allowed in `@safe` code.
  ref inout(Token) front () inout { pragma(inline, true); return (lookup.length ? lookup[0] : tokeof); }

  // current token's loc
  auto loc () inout { pragma(inline, true); return front.loc; }
  /// Current token's end location.
  auto eloc () inout { pragma(inline, true); return front.eloc; }
  /// Recorded end of the previously consumed token.
  auto peloc () inout { pragma(inline, true); return pend; }

  // kind predicates for the current token
  bool isId () const { pragma(inline, true); return front.isId; }
  bool isStr () const { pragma(inline, true); return front.isStr; }
  bool isNum () const { pragma(inline, true); return front.isNum; }
  bool isSpec () const { pragma(inline, true); return front.isSpec; }
}
// this eats identifier
/// Requires the current token to be the identifier `id` (exact match) and consumes it;
/// otherwise throws `ErrorAt` at the current token.
void expect (const(char)[] id, string file=__FILE__, usize line=__LINE__) {
  immutable bool matches = (front.isId && front.str == id);
  if (!matches) error(loc, "`"~id.idup~"` expected", file, line);
  popFront();
}
// this eats identifier
/// Same as `expect()`, but compares ignoring the case of ASCII letters.
void expectCI (const(char)[] id, string file=__FILE__, usize line=__LINE__) {
  // poor man's `tolower()`
  static char lower (char c) pure nothrow @safe @nogc { return (c >= 'A' && c <= 'Z' ? cast(char)(c+32) : c); }

  if (front.isId) {
    auto name = front.str;
    if (name.length == id.length) {
      // advance while the folded chars agree
      usize pos = 0;
      while (pos < name.length && lower(name[pos]) == lower(id[pos])) ++pos;
      if (pos == name.length) { popFront(); return; }
    }
  }
  error(loc, "`"~id.idup~"` expected", file, line);
}
/// Requires a punctuation token, consumes it and returns its text (no copy is made).
auto expectSpec (string msg="punctuation expected", string file=__FILE__, usize line=__LINE__) {
  mustbeSpec(msg, file, line);
  // the token is dropped only after the result has been taken from it
  scope(exit) popFront();
  return lookup[0].str;
}

// this converts id to `string` via `.idup`, use with caution!
// `.idup` is used to not anchor the whole source string
/// Requires an identifier token, consumes it and returns a copy of its text.
string expectId (string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
  mustbeId(msg, file, line);
  scope(exit) popFront();
  return lookup[0].istr;
}

// this converts id to `string` via `.idup`, use with caution!
// `.idup` is used to not anchor the whole source string
/// Requires a string token, consumes it and returns a copy of its text.
string expectStr (string msg="string expected", string file=__FILE__, usize line=__LINE__) {
  //pragma(inline, true);
  mustbeStr(msg, file, line);
  scope(exit) popFront();
  return lookup[0].istr;
}
// `mustbe` doesn't eat token
// Each wrapper forwards to the method of the same name on the current token.
void mustbeType (Token.Type tp, string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeType(tp, msg, file, line);
}

void mustbeId (string msg="identifier expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeId(msg, file, line);
}

void mustbeStr (string msg="string expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeStr(msg, file, line);
}

void mustbeNum (string msg="number expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeNum(msg, file, line);
}

void mustbeSpec (string msg="punctuation expected", string file=__FILE__, usize line=__LINE__) {
  pragma(inline, true);
  front.mustbeSpec(msg, file, line);
}
/// Consumes the current token when it is the identifier `id` (exact match).
/// Returns: `true` when the token was consumed, `false` when nothing changed.
bool eat (const(char)[] id) {
  if (!front.isId || front.str != id) return false;
  popFront();
  return true;
}
/// Consumes the current token when it is the identifier `id`, ignoring the case of ASCII letters.
/// Returns: the token text as it was written in the source, or `null` when nothing was consumed.
const(char)[] eatCI (const(char)[] id) {
  // poor man's `tolower()`
  static char lower (char c) pure nothrow @safe @nogc { return (c >= 'A' && c <= 'Z' ? cast(char)(c+32) : c); }

  if (!front.isId) return null;
  auto name = front.str;
  if (name.length != id.length) return null;
  foreach (immutable pos; 0..name.length) {
    if (lower(name[pos]) != lower(id[pos])) return null;
  }
  // take the text before the token is dropped
  popFront();
  return name;
}
/// Returns the token `dist` positions ahead of the current one (0 is the current token),
/// reading more tokens as needed; returns the EOF token when the input ends first.
ref Token peek (uint dist) {
  while (!eof && lookup.length <= dist) nextToken();
  return (dist < lookup.length ? lookup.ptr[dist] : tokeof);
}

/// `lexer[dist]` is the same as `lexer.peek(dist)`.
/// `peek()` takes a 32-bit distance, so the index is narrowed explicitly (an implicit
/// `usize` -> `uint` conversion is rejected on 64-bit targets); anything that does not fit
/// is clamped, which still ends up at the EOF token.
ref Token opIndex (usize dist) { pragma(inline, true); return peek(dist < uint.max ? cast(uint)dist : uint.max); }
// return loc for next `getChar()`
Loc nextLoc () nothrow @safe @nogc {
  Loc upcoming = cpos;
  if (!lastWasEOL) {
    // still on the same line
    ++upcoming.col;
    return upcoming;
  }
  // the previous char ended a line: the next one opens a new line
  ++upcoming.line;
  upcoming.col = 1;
  return upcoming;
}
/// Looks at the char `dist` positions ahead without consuming anything.
/// Returns: `'\0'` past the end of the text; a zero byte inside the text is reported as a space.
char peekChar (uint dist=0) nothrow @trusted @nogc {
  pragma(inline, true);
  return (tpos+dist < text.length ? (text.ptr[tpos+dist] != '\0' ? text.ptr[tpos+dist] : ' ') : '\0');
}
// return char or 0
/// Consumes one char and updates `cpos` (offset, line, column).
/// Returns: the char, with a zero byte inside the text reported as a space; `'\0'` once the end is reached.
char getChar () nothrow @trusted @nogc {
  // `tpos` is a 32-bit offset; the explicit cast keeps this compiling where `size_t` is 64 bits wide
  if (tpos >= text.length) { tpos = cast(uint)text.length; eof = true; }
  if (eof) return '\0';
  cpos.tpos = tpos;
  char ch = text.ptr[tpos++];
  if (ch == '\0') ch = ' ';
  // a char that follows a newline starts a new line
  if (lastWasEOL) { ++cpos.line; cpos.col = 1; } else ++cpos.col;
  lastWasEOL = (ch == '\n');
  return ch;
}
// skip blanks and comments
//TODO: make special "comment" token(s)?
/// Recognised comments: `// ...` up to the end of line, `(* ... *)` and `{ ... }`.
/// Throws `ErrorAt` (at the comment start) when a block comment is not closed.
void skipBlanks () @safe {
  for (;;) {
    immutable char first = peekChar;

    // single-line comment: consume through the newline (or up to the end of text)
    if (first == '/' && peekChar(1) == '/') {
      for (;;) {
        immutable char c = getChar();
        if (c == 0 || c == '\n') break;
      }
      continue;
    }

    // `(* ... *)` comment
    if (first == '(' && peekChar(1) == '*') {
      getChar(); // skip starting char
      Loc start = cpos;
      getChar(); // skip star
      // `prev` starts as a blank so the opening star can't also close the comment
      char prev = ' ';
      for (;;) {
        immutable char cur = getChar();
        if (cur == 0) error(start, "unterminated comment");
        if (prev == '*' && cur == ')') break;
        prev = cur;
      }
      continue;
    }

    // `{ ... }` comment
    if (first == '{') {
      getChar(); // skip starting char
      Loc start = cpos;
      for (;;) {
        immutable char cur = getChar();
        if (cur == 0) error(start, "unterminated comment");
        if (cur == '}') break;
      }
      continue;
    }

    // stop at end of text or at the first char above space
    if (first == 0 || first > 32) return;
    getChar();
  }
}
/// Reads one token from the text and appends it to the lookahead buffer.
/// At the end of the text nothing is appended: `eof` is set and the EOF token's position is updated.
/// Throws `ErrorAt` for an unterminated string or a malformed number.
private void nextToken () {
  if (eof) return;

  skipBlanks();
  if (peekChar == '\0') {
    eof = true;
    tokeof.loc = cpos;
    tokeof.eloc = cpos;
    return;
  }

  Token tk;
  auto tkspos = tpos;
  char ch = getChar();
  tk.loc = cpos;

  // quoted string
  if (ch == '"' || ch == '\'') {
    char ech = ch; // the string ends at the same quote char it started with
    tk.type = Token.Type.Str;
    ++tkspos; // skip quote
    for (;;) {
      ch = getChar();
      if (ch == 0) error(tk, "unterminated string");
      if (ch == ech) break;
    }
    tk.tkstr = text[tkspos..tpos-1]; // -1 due to eaten quote
    tk.eloc = cpos;
    lookup ~= tk;
    return;
  }

  // hex number
  if (ch == '$') {
    long n = 0;
    tk.type = Token.Type.Num;
    // the dollar was already consumed by the `getChar()` above, so the next char is the
    // first digit; it must not be skipped, or the most significant digit would be lost
    int dv = digitValue(peekChar);
    if (dv < 0 || dv > 15) error(tk, "hex number expected");
    for (;;) {
      dv = digitValue(peekChar);
      if (dv < 0 || dv > 15) break;
      n = n*16+dv;
      getChar();
    }
    ch = peekChar;
    if (isIdChar(ch) || ch == '.') error(tk, "hex number expected");
    tk.num = n;
    tk.tkstr = text[tkspos..tpos];
    tk.eloc = cpos;
    lookup ~= tk;
    return;
  }

  // number
  if (isDigit(ch)) {
    long n = ch-'0';
    tk.type = Token.Type.Num;
    for (;;) {
      if (!isDigit(peekChar)) break;
      ch = getChar();
      n = n*10+ch-'0';
    }
    tk.num = n;
    tk.tkstr = text[tkspos..tpos];
    tk.eloc = cpos;
    ch = peekChar;
    // a number must not run straight into an identifier char
    if (isIdChar(ch)) error(tk, "invalid number");
    lookup ~= tk;
    return;
  }

  // identifier
  if (isIdStart(ch)) {
    tk.type = Token.Type.Id;
    while (isIdChar(peekChar)) getChar();
    tk.tkstr = text[tkspos..tpos];
    tk.eloc = cpos;
    lookup ~= tk;
    return;
  }

  static immutable string[9] longSpecs = [
    "<=",
    ">=",
    ":=",
    "<>",
    "+=",
    "-=",
    "*=",
    "/=",
    "..",
  ];
  enum MaxSpecLength = {
    int ml = 0;
    foreach (string s; longSpecs) if (s.length > ml) ml = cast(int)s.length;
    return ml;
  }();

  // delimiter
  // Grow the token one char at a time while the chars collected so far (plus the candidate
  // char) are a prefix of one single entry of `longSpecs`. Comparing the whole prefix, not
  // just the char at the current position, prevents gluing together chars that belong to
  // different entries (e.g. `<` from "<=" and `.` from "..").
  char[MaxSpecLength] dbuf;
  dbuf[0] = ch;
  uint len = 0;
  for (;;) {
    bool found = false;
    foreach (string s; longSpecs) {
      if (len < s.length && s[0..len+1] == dbuf[0..len+1]) { found = true; break; }
    }
    if (!found) break;
    if (len > 0) getChar(); // this char should be eaten
    if (++len >= MaxSpecLength) break;
    dbuf[len] = peekChar(0);
  }
  tk.type = Token.Type.Spec;
  tk.tkstr = text[tkspos..tpos];
  tk.eloc = cpos;
  lookup ~= tk;
}
/// `selectN()` applied to the current token (n = 0).
auto select(RetType, string mode="peek", A...) (scope A args) { pragma(inline, true); return selectN!(RetType, mode)(0, args); }

/// Dispatches on the token `n` positions ahead.
///
/// `args` is read as pairs: an even-indexed argument of type `Token.Type` (or `Keyword`) is
/// compared with the token, and on a match the argument right after it is called. An
/// even-indexed argument of any other type is called unconditionally as the default handler.
/// A handler may take a `Token`, a `Loc`, a `Token.Type`, a `Keyword`, or nothing.
///
/// `mode`: "peek" never pops; "pop" pops before any handler runs; "pop-nondefault" pops
/// before matched handlers only.
///
/// Returns: the handler's result cast to `RetType`, or `RetType.init` for a `void` handler.
/// Throws: `ErrorAt` when no handler was called.
auto selectN(RetType, string mode="peek", A...) (usize n, scope A args) {
  import std.traits : ReturnType;

  static assert(mode == "peek" || mode == "pop" || mode == "pop-nondefault", "selectN: invalid mode: '"~mode~"'");

  // true when args[idx] is a callable taking exactly one parameter of type T
  template isGoodDg(usize idx, T) {
    private import std.traits;
    static if (idx < A.length && isCallable!(A[idx]) && arity!(args[idx]) == 1) {
      enum isGoodDg = is(Parameters!(A[idx])[0] == T);
    } else {
      enum isGoodDg = false;
    }
  }

  // true when args[idx] is a callable taking no parameters
  template isGoodArglessDg(usize idx) {
    private import std.traits;
    static if (idx < A.length && isCallable!(A[idx]) && arity!(args[idx]) == 0) {
      enum isGoodArglessDg = true;
    } else {
      enum isGoodArglessDg = false;
    }
  }

  // sorry, but this has to be string mixin, due to possible empty `arg`
  enum DoCallDg(string arg) =
    "static if (!is(ReturnType!(A[xidx]) == void)) return cast(RetType)(args[xidx]("~arg~")); else { args[xidx]("~arg~"); return RetType.init; }";

  // we can't have inner mixin templates, so... sorry, it's string again
  // NOTE(review): `Keyword` is not declared anywhere in the visible part of this module -- confirm
  // where it is supposed to come from; the argless check is only reached after the `Keyword` one.
  enum CallDg = q{
    static if (isGoodDg!(xidx, Token)) { mixin(DoCallDg!"tk"); }
    else static if (isGoodDg!(xidx, Loc)) { mixin(DoCallDg!"tk.loc"); }
    else static if (isGoodDg!(xidx, Token.Type)) { mixin(DoCallDg!"tk.type"); }
    else static if (isGoodDg!(xidx, Keyword)) { mixin(DoCallDg!"tk.Kw"); }
    else static if (isGoodArglessDg!(xidx)) { mixin(DoCallDg!""); }
    else static assert(0, "selectN: invalid delegate #"~xidx.stringof);
  };

  // `peek()` takes a 32-bit distance; narrow explicitly, the implicit conversion is rejected on 64-bit targets
  auto tk = peek(cast(uint)n);
  bool found = false;
  foreach (immutable aidx, immutable arg; args) {
    static if (aidx%2 == 0) {
      static if (is(typeof(arg) == Keyword) || is(typeof(arg) == Token.Type)) {
        static if (is(typeof(arg) == Keyword)) found = (tk == arg);
        else static if (is(typeof(arg) == Token.Type)) found = (tk.type == arg);
        else static assert(0, "wtf?!");
        if (found) {
          // process `mode`
          static if (mode != "peek") popFront();
          // call delegate
          enum xidx = aidx+1;
          mixin(CallDg);
        }
      } else {
        // default
        // process `mode`
        static if (mode == "pop") popFront();
        // call delegate
        enum xidx = aidx;
        mixin(CallDg);
      }
    }
  }
  error(tk, "selectN is out of nodes");
  assert(0);
}
508 static:
// Digit value of every byte: '0'..'9' -> 0..9, letters of either case -> 10..35, anything else -> -1.
private immutable byte[256] digitValues = {
  byte[256] tbl = -1;
  foreach (immutable int d; 0..10) tbl['0'+d] = cast(byte)d;
  foreach (immutable int d; 0..26) {
    tbl['A'+d] = cast(byte)(10+d);
    tbl['a'+d] = cast(byte)(10+d);
  }
  return tbl;
}();

// Bytes that may start an identifier: ASCII letters and underscore.
private immutable bool[256] idStartChars = {
  bool[256] tbl = false;
  foreach (immutable int d; 0..26) {
    tbl['A'+d] = true;
    tbl['a'+d] = true;
  }
  tbl['_'] = true;
  return tbl;
}();

// Bytes that may continue an identifier: ASCII letters, decimal digits and underscore.
private immutable bool[256] idChars = {
  bool[256] tbl = false;
  foreach (immutable int d; 0..10) tbl['0'+d] = true;
  foreach (immutable int d; 0..26) {
    tbl['A'+d] = true;
    tbl['a'+d] = true;
  }
  tbl['_'] = true;
  return tbl;
}();

/// Is `ch` a decimal digit?
bool isDigit() (char ch) { pragma(inline, true); return ('0' <= ch && ch <= '9'); }
/// Digit value of `ch` (0..35), or -1 when `ch` is neither a digit nor an ASCII letter.
int digitValue() (char ch) { pragma(inline, true); return digitValues[cast(ubyte)ch]; }
/// May `ch` start an identifier?
bool isIdStart() (char ch) { pragma(inline, true); return idStartChars[cast(ubyte)ch]; }
/// May `ch` appear inside an identifier?
bool isIdChar() (char ch) { pragma(inline, true); return idChars[cast(ubyte)ch]; }
/// Renders `s` as an expression made of double-quoted runs and `chr(N)` calls joined with `+`.
/// Control chars (below space), char 127 and the double quote itself become `chr(N)`;
/// every other char goes into a quoted run. An empty input gives `""`.
string gmlQuote (const(char)[] s) {
  import std.array : appender;
  import std.conv : to;

  // without any chars there is nothing to join
  if (s.length == 0) return `""`;

  auto buf = appender!string();
  bool first = true; // nothing has been emitted yet
  bool inLiteral = false; // the last thing emitted was a char of a still-open quoted run
  foreach (char ch; s) {
    immutable bool special = (ch < ' ' || ch == 127 || ch == '"');
    if (special) {
      // close the open run and/or add the joiner
      if (inLiteral) buf.put(`"+`);
      else if (!first) buf.put(`+`);
      buf.put("chr(");
      buf.put(to!string(cast(uint)ch));
      buf.put(")");
      inLiteral = false;
    } else {
      // open a new run when we aren't inside one
      if (first) buf.put('"');
      else if (!inLiteral) buf.put(`+"`);
      buf.put(ch);
      inLiteral = true;
    }
    first = false;
  }
  if (inLiteral) buf.put('"');
  return buf.data;
}
/// quote string: append double quotes, screen all special chars;
/// so quoted string forms valid D string literal.
/// allocates.
string quote (const(char)[] s) {
  import std.format : format;
  // Elements formatted through a compound specifier are quoted and escaped automatically.
  // This is the documented way to get the escaping; `formatElement` is an internal Phobos
  // helper that was deprecated and later dropped from the public `std.format` API.
  return format("%(%s%)", [s]);
}
585 version(lexer_test) unittest {
586 import std.file;
587 import std.stdio;
588 //enum FName = "z00.txt";
589 enum FName = "shared/MAPDEF.pas";
590 string s;
592 auto fl = File(FName);
593 auto buf = new char[](cast(uint)fl.size);
594 fl.rawRead(buf[]);
595 s = cast(string)buf;
597 auto lex = new Lexer(s, FName);
598 try {
599 while (!lex.empty) {
600 writeln(lex.front);
601 lex.popFront();
603 } catch (ErrorAt e) {
604 writeln("PARSE ERROR: ", e.line);
605 writeln(e.loc);