1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity very high)
4 module dil.lexer.Lexer;
6 import dil.lexer.Token,
7        dil.lexer.Funcs,
8        dil.lexer.Identifier,
9        dil.lexer.IDsEnum,
10        dil.lexer.TokenSerializer,
11        dil.lexer.Tables;
12 import dil.i18n.Messages;
13 import dil.Diagnostics,
14        dil.HtmlEntities,
15        dil.ChunkAllocator,
16        dil.Array,
17        dil.Version,
18        dil.Unicode,
19        dil.SourceText,
20        dil.Time,
21        dil.String;
22 import dil.Float : Float;
23 import util.uni : isUniAlpha;
24 import common;
26 /// The Lexer analyzes the characters of a source text and
27 /// produces an array of tokens.
28 class Lexer
29 {
30   cchar* p; /// Points to the current character in the source text.
31   cchar* end; /// Points one character past the end of the source text.
32   SourceText srcText; /// The source text.
34   TokenArray tokens; /// Array of Tokens.
35   LexerTables tables; /// Used to look up token values.
36   CharArray buffer; /// A buffer for string values.
37   ChunkAllocator allocator; /// Allocates memory for non-token structs.
  /// Groups line information.
  ///
  /// Pairs the pointer to the start of a source line with that line's number.
  static struct LineLoc
  {
    cchar* p; /// Points to the first character of the current line.
    uint n; /// Actual source text line number.
  }
45   LineLoc lineLoc; /// Current line.
47   uint inTokenString; /// > 0 if inside q{ }
48   /// Holds the original file path and the modified one (by #line.)
49   Token.HashLineInfo* hlinfo; /// Info set by "#line".
51   // Members used for error messages:
52   Diagnostics diag; /// For diagnostics.
53   LexerError[] errors; /// List of errors.
54   // End of variable members.
56   alias T = S2T; /// Converts, e.g., T!"+" to TOK.Plus.
  // Multi-character prefixes converted to integers by castInt().
  // The scan methods read a ushort from the source text and compare it
  // against the two-character constants to recognize string prefixes.
  static
  {
  const ushort chars_r = castInt(`r"`); /// `r"` as a ushort.
  const ushort chars_x = castInt(`x"`); /// `x"` as a ushort.
  const ushort chars_q = castInt(`q"`); /// `q"` as a ushort.
  const ushort chars_q2 = castInt(`q{`); /// `q{` as a ushort.
  const ushort chars_shebang = castInt("#!"); /// `#!` as a ushort.
  const uint chars_line = castInt("line"); /// `line` as a uint.
  }
68   /// Constructs a Lexer object.
69   /// Params:
70   ///   srcText = The UTF-8 source code.
71   ///   tables = Used to look up identifiers and token values.
72   ///   diag = Used for collecting error messages.
73   this(SourceText srcText, LexerTables tables, Diagnostics diag = null)
74   {
75     version(gc_tokens)
76     {}
77     else
78       this.allocator.initialize(PAGESIZE);
79     this.srcText = srcText;
80     this.tables = tables;
81     this.diag = diag ? diag : new Diagnostics();
82     assert(text.length >= 4 && text[$-4..$] == SourceText.sentinelString,
83       "source text has no sentinel character");
84     this.p = text.ptr;
85     this.end = this.p + text.length; // Point past the sentinel string.
86     this.lineLoc.p = this.p;
87     this.lineLoc.n = 1;
88   }
  /// Destructor. Calls destroy() on the allocator.
  ~this()
  {
    allocator.destroy();
  }
95   /// Returns the next free token from the array.
96   /// NB: The bytes are not zeroed out.
97   Token* newToken()
98   {
99     if (tokens.rem == 0)
100       tokens.growX1_5();
101     return tokens.cur++;
102   }
104   /// Allocates memory for T.
105   T* new_(T)()
106   {
107     version (gc_tokens) // Use to test GC instead of custom allocator.
108       return new T;
109     else
110     {
111       auto t = cast(T*)allocator.allocate(T.sizeof);
112       *t = T.init;
113       return t;
114     }
115   }
117   /// Callback function to TokenSerializer.deserialize().
118   bool dlxCallback(Token* t)
119   {
120     switch (t.kind)
121     { // Some tokens need special handling:
122     case T!"Newline":
123       setLineBegin(t.end);
124       t.nlval = lookupNewline();
125       break;
126     case T!"Character": // May have escape sequences.
127       this.p = t.start;
128       scanCharacter(t);
129       break;
130     case T!"String": // Escape sequences; token strings; etc.
131       this.p = t.start;
132       dchar c = *cast(ushort*)p;
133       switch (c)
134       {
135       case chars_r:
136         ++this.p, scanRawString(t); break;
137       case chars_x:
138         scanHexString(t); break;
139       version(D2)
140       {
141       case chars_q:
142         scanDelimitedString(t); break;
143       case chars_q2:
144         scanTokenString(t); break;
145       }
146       default:
147       }
148       switch (*p)
149       {
150       case '`':
151         scanRawString(t); break;
152       case '"':
153         scanNormalString(t); break;
154       version(D1)
155       { // Only in D1.
156       case '\\':
157         scanEscapeString(t); break;
158       }
159       default:
160       }
161       break;
162     case T!"Comment": // Just rescan for newlines.
163       if (t.isMultiline) // Mutliline tokens may have newlines.
164         for (auto p = t.start, end = t.end; p < end;)
165           if (scanNewline(p))
166             setLineBegin(p);
167           else
168             ++p;
169       break;
170     case T!"Int32", T!"Int64", T!"UInt32", T!"UInt64":
171       this.p = t.start;
172       scanNumber(t); // Complicated. Let the method handle this.
173       break;
174     case T!"Float32", T!"Float64", T!"Float80",
175          T!"IFloat32", T!"IFloat64", T!"IFloat80":
176       // The token is complete. What remains is to get its value.
177       t.mpfloat = lookupFloat(copySansUnderscores(t.start, t.end));
178       break;
179     case T!"#line":
180       this.p = t.start;
181       scanSpecialTokenSequence(t); // Complicated. Let the method handle this.
182       break;
183     case T!"#!Shebang", T!"Empty": // Whitespace tokens.
184       break;
185     default:
186     }
187     return true;
188   }
190   /// Loads the tokens from a dlx file.
191   bool fromDLXFile(ubyte[] data)
192   {
193     auto dlxTokens = TokenSerializer.deserialize(
194       data, this.text(), tables.idents, &dlxCallback);
195     if (dlxTokens.length)
196     {
197       alias ts = dlxTokens;
198       ts[0] = Token.init; // NullToken
199       ts[1].kind = T!"HEAD";
200       ts[1].ws = null;
201       ts[1].start = ts[1].end = this.text.ptr;
202       ts[1].pvoid = null;
203       ts[2].kind = T!"Newline";
204       ts[2].ws = null;
205       ts[2].start = ts[2].end = this.text.ptr;
206       ts[2].nlval = lookupNewline();
207       ts[$-1] = Token.init; // NullToken
208       this.p = ts[$-2].end;
209       tokens.ptr = ts.ptr;
210       tokens.cur = tokens.end = ts.ptr + ts.length;
211     }
212     else
213     { /// Function failed. Reset...
214       this.p = this.text.ptr;
215       this.lineLoc.p = this.p;
216       this.lineLoc.n = 1;
217     }
218     return !!dlxTokens.length;
219   }
221   /// Acquires the current buffer.
222   CharArray getBuffer()
223   {
224     auto buffer = this.buffer;
225     this.buffer = buffer.init;
226     return buffer;
227   }
229   /// Takes over buffer if its capacity is greater than the current one.
230   void setBuffer(CharArray buffer)
231   {
232     buffer.len = 0;
233     if (buffer.cap > this.buffer.cap)
234       this.buffer = buffer;
235   }
237   /// Returns the source text string.
238   cstring text()
239   {
240     return srcText.data;
241   }
243   /// Returns the end pointer excluding the sentinel string.
244   cchar* endX()
245   {
246     return this.end - SourceText.sentinelString.length;
247   }
249   /// Returns the first token of the source text.
250   /// This can be the EOF token.
251   /// Structure: [NullToken, HEAD, Newline, FirstToken, ..., NullToken]
252   Token* firstToken()
253   {
254     return tokens.ptr + 3;
255   }
257   /// Returns the list of tokens excluding special beginning and end tokens.
258   Token[] tokenList()
259   {
260     return firstToken[0 .. tokens.len-4];
261   }
263   /// Returns the HEAD token.
264   Token* head()
265   {
266     return tokens.ptr + 1;
267   }
269   /// Returns the EOF token.
270   Token* lastToken()
271   {
272     return tokens.cur - 2;
273   }
275   /// Sets the value of the special token.
276   void finalizeSpecialToken(Token* t)
277   {
278     assert(t.kind == T!"SpecialID" && t.text[0..2] == "__");
279     cstring str;
280     switch (t.ident.idKind)
281     {
282     case IDK.FILE:
283       str = errorFilePath();
284       break;
285     case IDK.LINE:
286       t.sizet_ = this.errorLineNumber(this.lineNum);
287       break;
289       str = Time.now();
290       switch (t.kind)
291       {
292       case IDK.DATE:
293         str = Time.month_day(str) ~ ' ' ~ Time.year(str); break;
294       case IDK.TIME:
295         str = Time.time(str); break;
296       case IDK.TIMESTAMP:
297         break; // str is the timestamp.
298       default: assert(0);
299       }
300       break;
301     case IDK.VENDOR:
302       str = VENDOR;
303       break;
304     case IDK.VERSION:
305       t.uint_ = VERSION_MAJOR*1000 + VERSION_MINOR;
306       break;
307     case IDK.EOF:
308       assert(t.text == "__EOF__");
309       t.kind = T!"EOF"; // Convert to EOF token, so that the Parser will stop.
310       break;
311     default:
312       assert(0);
313     }
314     if (str.ptr)
315       t.strval = lookupString(str, '\0');
316   }
318   /// Returns the current line number.
319   size_t lineNum()
320   {
321     return lineLoc.n;
322   }
324   /// Sets the line pointer and increments the line number.
325   private void setLineBegin(cchar* p)
326   {
327     assert(isNewlineEnd(p - 1));
328     lineLoc.p = p;
329     lineLoc.n++;
330   }
332   /// Returns true if p points to the last character of a Newline.
333   bool isNewlineEnd(cchar* p)
334   {
335     assert(p >= text.ptr && p < end);
336     return (*p).In('\n', '\r') || (p-=2) >= text.ptr && p[0..3].In(LS, PS);
337   }
339   /// Returns true if p points inside the source text.
340   bool isInText(cchar* p)
341   {
342     return text.ptr <= p && p < end;
343   }
345   alias StringValue = Token.StringValue;
346   alias IntegerValue = Token.IntegerValue;
347   alias NewlineValue = Token.NewlineValue;
349   /// Looks up a StringValue. Copies str if it's not a slice from the src text.
350   StringValue* lookupString(cstring str, char postfix)
351   {
352     return tables.lookupString(str, postfix, !isInText(str.ptr));
353   }
355   /// Forwards to tables.lookupString().
356   cbinstr lookupString(cbinstr bstr)
357   {
358     auto str = cast(cstring)bstr;
359     return tables.lookupString(hashOf(str), str);
360   }
362   /// Looks up a Float in the table.
363   /// Params:
364   ///   str = The zero-terminated string of the float number.
365   Float lookupFloat(cstring str)
366   {
367     assert(str.length && str[$-1] == 0);
368     auto hash = hashOf(str);
369     auto pFloat = hash in tables.floats;
370     if (!pFloat)
371     {
372       int precision;
373       auto f = new Float(precision, str);
374       // if (precision == 0) // Exact precision.
375       // {}
376       // else if (precision < 0) // Lower precision.
377       // {}
378       // else /*if (precision > 0)*/ // Higher precision.
379       // {}
380       tables.floats[hash] = f;
381       return f;
382     }
383     return *pFloat;
384   }
386   /// Looks up a newline value.
387   NewlineValue* lookupNewline()
388   {
389     auto lineNum = this.lineNum;
390     if (hlinfo)
391     { // Don't insert into the table, when '#line' tokens are in the text.
392       // This could be optimised with another table.
393       auto nl = new_!(NewlineValue);
394       nl.lineNum = lineNum;
395       auto hlinfo = nl.hlinfo = new_!(Token.HashLineInfo);
396       *hlinfo = *this.hlinfo;
397       return nl;
398     }
399     return tables.lookupNewline(lineNum);
400   }
402   /// Advance t one token forward.
403   void peek(ref Token* t)
404   {
405     t++;
406     assert(tokens.ptr <= t && t < tokens.cur);
407   }
409   /// Scans the whole source text until EOF is encountered.
410   void scanAll()
411   { // The divisor 6 is an average measured by lexing large D projects.
412     auto estimatedNrOfTokens = text.length / 6;
413     tokens.cap = estimatedNrOfTokens;
414     if (tokens.cap < 5)
415       tokens.cap = 5; // Guarantee space for at least 5 tokens.
416     auto first = newToken();
417     *first = Token.init;
418     auto head = newToken();
419     head.kind = T!"HEAD";
420     head.ws = null;
421     head.start = head.end = this.p;
422     head.pvoid = null;
423     // Add a "virtual" newline as the first token after the head.
424     auto newline = newToken();
425     newline.kind = T!"Newline";
426     newline.ws = null;
427     newline.start = newline.end = this.p;
428     newline.nlval = lookupNewline();
429     // Scan optional shebang.
430     if (*cast(ushort*)this.p == chars_shebang)
431       scanShebang();
432     // Main loop scanning the whole text.
433     Token* t;
434     do
435       scan(t = newToken());
436     while (t.kind != T!"EOF");
437     // Add a terminating token, similar to 0 in C-like strings.
438     auto last = newToken();
439     *last = Token.init;
441     auto toks = tokenList;
442     foreach (x; toks)
443     {}
444   }
446   /// The "shebang" may optionally appear once at the beginning of a file.
447   /// $(BNF Shebang := "#!" AnyChar* EndOfLine)
448   void scanShebang()
449   {
450     auto p = this.p;
451     assert(p[0..2] == "#!");
452     auto t = newToken();
453     t.kind = T!"#!Shebang";
454     t.start = p++;
455     while (!isEndOfLine(++p))
456       isascii(*p) || decodeUTF8(p);
457     t.end = this.p = p;
458     t.pvoid = null;
459   }
461   /// The main method which recognizes the characters that make up a token.
462   ///
463   /// Complicated tokens are scanned in separate methods.
464   public void scan(Token* t)
465   in
466   {
467     assert(text.ptr <= p && p < end);
468   }
469   out
470   {
471     assert(text.ptr <= t.start && t.start < end, t.kind.toString);
472     assert(text.ptr <= t.end && t.end <= end, t.kind.toString);
473     assert(t.kind != T!"Invalid", t.text);
474   }
475   body
476   {
477     TOK kind; // The token kind that will be assigned to t.kind.
478     auto p = this.p; // Incrementing a stack variable is faster.
479     // Scan whitespace.
480     if (isspace(*p))
481     {
482       t.ws = p;
483       while (isspace(*++p))
484       {}
485     }
486     else
487       t.ws = null;
488     t.pvoid = null;
490     // Scan the text of the token.
491     dchar c = *p;
492     {
493       t.start = this.p = p;
495       // Identifier or string literal.
496       if (isidbeg(c))
497       {
498         c = *cast(ushort*)p;
499         if (c == chars_r)
500           return ++this.p, scanRawString(t);
501         if (c == chars_x)
502           return scanHexString(t);
503         version(D2)
504         {
505         if (c == chars_q)
506           return scanDelimitedString(t);
507         if (c == chars_q2)
508           return scanTokenString(t);
509         }
511         // Scan identifier.
512       Lidentifier:
513         do
514         { c = *++p; }
515         while (isident(c) || !isascii(c) && scanUnicodeAlpha(p));
516         t.end = this.p = p;
518         auto id = tables.lookupIdentifier(t.text);
519         t.kind = id.kind;
520         t.ident = id;
521         assert(t.isKeyword || id.kind.In(T!"SpecialID", T!"Identifier"));
523         if (kind == T!"SpecialID")
524           finalizeSpecialToken(t);
525         return;
526       }
528       /// Advances p if p[1] equals x.
529       bool next(cchar x)
530       {
531         return p[1] == x ? (++p, 1) : 0;
532       }
534       // Newline.
535       if (*p == '\n' || *p == '\r' && (next('\n'), true))
536         goto Lnewline;
538       assert(this.p == p);
539       if (isdigit(c))
540         return scanNumber(t);
542       switch (c)
543       {
544       // Cases are sorted roughly according to times of occurrence.
545       mixin(cases(",", "(", ")", ";", "{", "}", "[", "]", ":"));
546       case '.': /* .  .[0-9]  ..  ... */
547         if (next('.'))
548           kind = next('.') ? T!"..." : T!"..";
549         else if (isdigit(p[1]))
550           return (this.p = p), scanFloat(t);
551         else
552           kind = T!".";
553         goto Lcommon;
554       case '=': /* =  ==  => */
555         kind = next('=') ? T!"==" : (next('>') ? T!"=>" : T!"=");
556         goto Lcommon;
557       case '`':
558         return scanRawString(t);
559       case '"':
560         return scanNormalString(t);
561       version(D1)
562       { // Only in D1.
563       case '\\':
564         return scanEscapeString(t);
565       }
566       case '\'':
567         return scanCharacter(t);
568       case '/':
569         switch (*++p)
570         {
571         case '=':
572           kind = T!"/=";
573           goto Lcommon;
574         case '+':
575           return (this.p = p), scanNestedComment(t);
576         case '*':
577           return (this.p = p), scanBlockComment(t);
578         case '/': // LineComment.
579           while (!isEndOfLine(++p))
580             isascii(*p) || decodeUTF8(p);
581           kind = T!"Comment";
582           goto Lreturn;
583         default:
584           kind = T!"/";
585           goto Lreturn;
586         }
587         assert(0);
588       case '>': /* >  >=  >>  >>=  >>>  >>>= */
589         switch (*++p)
590         {
591         case '=':
592           kind = T!">=";
593           goto Lcommon;
594         case '>':
595           if (next('>'))
596             kind = next('=') ? T!">>>=" : T!">>>";
597           else
598             kind = next('=') ? T!">>=" : T!">>";
599           goto Lcommon;
600         default:
601           kind = T!">";
602           goto Lreturn;
603         }
604         assert(0);
605       case '<': /* <  <=  <>  <>=  <<  <<= */
606         switch (*++p)
607         {
608         case '=':
609           kind = T!"<=";
610           goto Lcommon;
611         case '<':
612           kind = next('=') ? T!"<<=" : T!"<<";
613           goto Lcommon;
614         case '>':
615           kind = next('=') ? T!"<>=" : T!"<>";
616           goto Lcommon;
617         default:
618           kind = T!"<";
619           goto Lreturn;
620         }
621         assert(0);
622       case '!': /* !  !<  !>  !<=  !>=  !<>  !<>= */
623         switch (*++p)
624         {
625         case '<':
626           if (next('>'))
627             kind = next('=') ? T!"!<>=" : T!"!<>";
628           else
629             kind = next('=') ? T!"!<=" : T!"!<";
630           goto Lcommon;
631         case '>':
632           kind = next('=') ? T!"!>=" : T!"!>";
633           goto Lcommon;
634         case '=':
635           kind = T!"!=";
636           goto Lcommon;
637         default:
638           kind = T!"!";
639           goto Lreturn;
640         }
641         assert(0);
642       case '|': /* |  ||  |= */
643         kind = next('=') ? T!"|=" : (next('|') ? T!"||" : T!"|");
644         goto Lcommon;
645       case '&': /* &  &&  &= */
646         kind = next('=') ? T!"&=" : (next('&') ? T!"&&" : T!"&");
647         goto Lcommon;
648       case '+': /* +  ++  += */
649         kind = next('=') ? T!"+=" : (next('+') ? T!"++" : T!"+");
650         goto Lcommon;
651       case '-': /* -  --  -= */
652         kind = next('=') ? T!"-=" : (next('-') ? T!"--" : T!"-");
653         goto Lcommon;
654       case '~': /* ~  ~= */
655         kind = next('=') ? T!"~=" : T!"~";
656         goto Lcommon;
657       case '*': /* *  *= */
658         kind = next('=') ? T!"*=" : T!"*";
659         goto Lcommon;
660       version(D2)
661       {
662       case '^': /* ^  ^=  ^^  ^^= */
663         if (next('='))
664           kind = T!"^=";
665         else if (next('^'))
666           kind = next('=') ? T!"^^=" : T!"^^";
667         else
668           kind = T!"^";
669         goto Lcommon;
670       } // end of version(D2)
671       else
672       {
673       case '^': /* ^  ^= */
674         kind = next('=') ? T!"^=" : T!"^";
675         goto Lcommon;
676       }
677       case '%': /* %  %= */
678         kind = next('=') ? T!"%=" : T!"%";
679         goto Lcommon;
680       // Single character tokens:
681       mixin(cases("@","$","?"));
682       case '#':
683         assert(this.p == p);
684         return scanSpecialTokenSequence(t);
685       default:
686       }
688       // Check for EOF
689       if (isEOF(c))
690       {
691         assert(isEOF(*p), ""~*p);
692         kind = T!"EOF";
693         assert(t.start == p);
694         goto Lreturn;
695       }
697       assert(this.p == p);
698       if (!isascii(c) && isUniAlpha(c = decodeUTF8(p)))
699         goto Lidentifier;
701       if (isUnicodeNewlineChar(c))
702         goto Lnewline;
704       error(t.start, MID.IllegalCharacter, cast(dchar)c);
706       kind = T!"Illegal";
707       t.dchar_ = c;
708       goto Lcommon;
709     }
711   Lcommon:
712     ++p;
713   Lreturn:
714     t.kind = kind;
715     t.end = this.p = p;
716     return;
718   Lnewline:
719     setLineBegin(++p);
720     t.kind = T!"Newline";
721     t.nlval = lookupNewline();
722     t.end = this.p = p;
723     return;
724   }
726   /// Generates case statements for token strings.
727   /// ---
728   //// // case_("<") ->
729   /// case 60u:
730   ///   kind = T!"<";
731   ///   goto Lcommon;
732   /// ---
733   static char[] cases(string[] strs...)
734   {
735     char[] result;
736     foreach (str; strs)
737     {
738       char[] label_str = "Lcommon".dup;
739       if (str.length != 1) // Append length as a suffix.
740         label_str ~= '0' + cast(char)str.length;
741       result ~= `case castInt("`~str~`"): kind = T!"`~str~`"; `~
742                 "goto "~label_str~";\n";
743     }
744     return result;
745   }
746   //pragma(msg, cases("<", ">"));
748   /// An alternative scan method.
749   /// Profiling shows it's a bit slower.
750   public void scan_(Token* t)
751   in
752   {
753     assert(text.ptr <= p && p < end);
754   }
755   out
756   {
757     assert(text.ptr <= t.start && t.start < end, t.kind.toString);
758     assert(text.ptr <= t.end && t.end <= end, t.kind.toString);
759     assert(t.kind != T!"Invalid", t.text);
760   }
761   body
762   {
763     TOK kind; // The token kind that will be assigned to t.kind.
764     auto p = this.p; // Incrementing a stack variable is faster.
765     // Scan whitespace.
766     if (isspace(*p))
767     {
768       t.ws = p;
769       while (isspace(*++p))
770       {}
771     }
772     else
773       t.ws = null;
774     t.pvoid = null;
776     // Scan a token.
777     t.start = this.p = p;
779     uint c = *p;
781     assert(p == t.start);
782     // Check for ids first, as they occur the most often in source codes.
783     if (isidbeg(c))
784     {
785       c = *cast(ushort*)p;
786       if (c == chars_r)
787         return (this.p = ++p), scanRawString(t);
788       if (c == chars_x)
789         return scanHexString(t);
790       version(D2)
791       {
792       if (c == chars_q)
793         return scanDelimitedString(t);
794       if (c == chars_q2)
795         return scanTokenString(t);
796       }
798       // Scan an identifier.
799     Lidentifier:
800       do
801       { c = *++p; }
802       while (isident(c) || !isascii(c) && scanUnicodeAlpha(p));
803       t.end = this.p = p;
805       auto id = tables.lookupIdentifier(t.text);
806       t.kind = id.kind;
807       t.ident = id;
808       assert(t.isKeyword || id.kind.In(T!"SpecialID", T!"Identifier"));
810       if (kind == T!"SpecialID")
811         finalizeSpecialToken(t);
812       return;
813     }
815     if (isdigit(c))
816       return scanNumber(t);
819     // Thanks to the 4 zeros terminating the text,
820     // it is possible to look ahead 4 characters.
821     c = *cast(uint*)p;
823     // 4 character tokens.
824     switch (c)
825     {
826     mixin(cases(">>>=", "!<>="));
827     default:
828     }
830     version(BigEndian)
831     c >>>= 8;
832     else
833     c &= 0x00FFFFFF;
834     assert(p == t.start);
835     // 3 character tokens.
836     switch (c)
837     {
838     mixin(cases("<<=", ">>=", ">>>", "...",
839       "!<=", "!>=", "!<>", "<>=", "^^="));
840     case castInt(LS), castInt(PS):
841       p += 2;
842       goto Lnewline;
843     default:
844     }
846     version(BigEndian)
847     c >>>= 8;
848     else
849     c &= 0x0000FFFF;
850     assert(p == t.start);
851     // 2 character tokens.
852     switch (c)
853     {
854     case castInt("/+"):
855       this.p = ++p; // Skip /
856       return scanNestedComment(t);
857     case castInt("/*"):
858       this.p = ++p; // Skip /
859       return scanBlockComment(t);
860     case castInt("//"): // LineComment.
861       ++p; // Skip /
862       assert(*p == '/');
863       while (!isEndOfLine(++p))
864         isascii(*p) || decodeUTF8(p);
865       kind = T!"Comment";
866       goto Lreturn;
867     mixin(cases("<=", ">=", "<<", ">>", "==", "=>", "!=", "!<", "!>", "<>",
868       "..", "&&", "&=", "||", "|=", "++", "+=", "--", "-=", "*=", "/=", "%=",
869       "^=", "~=", "^^"));
870     case castInt("\r\n"):
871       ++p;
872       goto Lnewline;
873     default:
874     }
876     static TOK[127] char2TOK = [
877       '<': T!"<", '>': T!">", '^': T!"^", '!': T!"!",
878       '&': T!"&", '|': T!"|", '+': T!"+", '-': T!"-",
879       '=': T!"=", '~': T!"~", '*': T!"*", '/': T!"/",
880       '%': T!"%", '(': T!"(", ')': T!")", '[': T!"[",
881       ']': T!"]", '{': T!"{", '}': T!"}", ':': T!":",
882       ';': T!";", '?': T!"?", ',': T!",", '$': T!"$",
883       '@': T!"@"
884     ];
886     version(BigEndian)
887     c >>>= 8;
888     else
889     c &= 0x000000FF;
890     assert(p == t.start);
891     assert(*p == c, Format("p={0},c={1}", *p, cast(dchar)c));
892     // 1 character tokens.
893     // TODO: consider storing the token type in ptable.
894     if (c < 127 && (kind = char2TOK[c]) != 0)
895       goto Lcommon;
897     assert(this.p == p);
898     switch (c)
899     {
900     case '\r', '\n':
901       goto Lnewline;
902     case '\'':
903       return scanCharacter(t);
904     case '`':
905       return scanRawString(t);
906     case '"':
907       return scanNormalString(t);
908     version(D2)
909     {}
910     else { // Only in D1.
911     case '\\':
912       return scanEscapeString(t);
913     }
914     case '.':
915       if (isdigit(p[1]))
916         return (this.p = p), scanFloat(t);
917       kind = T!".";
918       ++p;
919       goto Lreturn;
920     case '#':
921       assert(this.p == p);
922       return scanSpecialTokenSequence(t);
923     default:
924     }
926     assert(p == t.start);
927     assert(*p == c);
929     // Check for EOF
930     if (isEOF(c))
931     {
932       assert(isEOF(*p), *p~"");
933       kind = T!"EOF";
934       assert(t.start == p);
935       goto Lreturn;
936     }
938     if (!isascii(c) && isUniAlpha(c = decodeUTF8(p)))
939       goto Lidentifier;
941     error(t.start, MID.IllegalCharacter, cast(dchar)c);
943     kind = T!"Illegal";
944     t.dchar_ = c;
945     goto Lcommon;
947   Lcommon4:
948     ++p;
949   Lcommon3:
950     ++p;
951   Lcommon2:
952     ++p;
953   Lcommon:
954     ++p;
955   Lreturn:
956     t.kind = kind;
957     t.end = this.p = p;
958     return;
960   Lnewline:
961     setLineBegin(++p);
962     t.kind = T!"Newline";
963     t.nlval = lookupNewline();
964     t.end = this.p = p;
965     return;
966   }
  /// Scans a block comment.
  ///
  /// On entry this.p must point to the '*' of the opening "/*".
  /// The token's kind is set to Comment, and t.end and this.p are set
  /// to the position where scanning stopped.
  /// An error is reported if EOF is reached before the closing "*/".
  ///
  /// $(BNF BlockComment := "/*" AnyChar* "*/")
  void scanBlockComment(Token* t)
  {
    auto p = this.p;
    assert((p-1)[0..2] == "/*");
    auto tokenLine = this.lineLoc; // Line info at the start; passed to error().
  Loop:
    while (1)
      switch (*++p)
      {
      case '*':
        if (p[1] != '/')
          continue;
        p += 2; // Skip "*/".
        break Loop;
      case '\r':
        if (p[1] == '\n') // Treat "\r\n" as a single newline.
          ++p;
        goto case;
      case '\n':
        setLineBegin(p+1);
        break;
      default:
        if (!isascii(*p))
        {
          if (isUnicodeNewlineChar(decodeUTF8(p)))
            goto case '\n';
        }
        else if (isEOF(*p)) {
          error(tokenLine, t.start, MID.UnterminatedBlockComment);
          break Loop;
        }
      }
    t.kind = T!"Comment";
    t.end = this.p = p;
    return;
  }
  /// Scans a nested comment.
  ///
  /// On entry this.p must point to the '+' of the opening "/+".
  /// The token's kind is set to Comment, and t.end and this.p are set
  /// to the position where scanning stopped.
  /// An error is reported if EOF is reached before all levels are closed.
  ///
  /// $(BNF NestedComment := "/+" (NestedComment | AnyChar)* "+/")
  void scanNestedComment(Token* t)
  {
    auto p = this.p;
    assert((p-1)[0..2] == "/+");
    auto tokenLine = this.lineLoc; // Line info at the start; passed to error().
    uint level = 1; // Number of comments that are currently open.
  Loop:
    while (1)
      switch (*++p)
      {
      case '/':
        if (p[1] == '+') // A nested comment begins.
          ++p, ++level;
        continue;
      case '+':
        if (p[1] != '/')
          continue;
        ++p; // Skip '+'.
        if (--level != 0)
          continue;
        ++p; // Skip '/' of the outermost "+/".
        break Loop;
      case '\r':
        if (p[1] == '\n') // Treat "\r\n" as a single newline.
          ++p;
        goto case;
      case '\n':
        setLineBegin(p+1);
        break;
      default:
        if (!isascii(*p))
        {
          if (isUnicodeNewlineChar(decodeUTF8(p)))
            goto case '\n';
        }
        else if (isEOF(*p)) {
          error(tokenLine, t.start, MID.UnterminatedNestedComment);
          break Loop;
        }
      }
    t.kind = T!"Comment";
    t.end = this.p = p;
    return;
  }
1056   /// Scans the postfix character of a string literal.
1057   ///
1058   /// $(BNF PostfixChar := "c" | "w" | "d")
1059   static char scanPostfix(ref cchar* p)
1060   {
1061     assert(p[-1].In('"', '`', '}'));
1062     return (*p).In('c', 'w', 'd') ? *p++ : '\0';
1063   }
  /// Scans a normal string literal.
  ///
  /// On entry this.p must point to the opening '"'.
  /// Escape sequences are replaced by their values, and newlines inside
  /// the literal are converted to '\n'. Only if such a modification was
  /// necessary is the value assembled in the string buffer; otherwise
  /// a slice of the source text is looked up directly.
  /// An error is reported if the text ends before the closing '"'.
  ///
  /// $(BNF NormalStringLiteral := '"' (EscapeSequence | AnyChar)* '"')
  void scanNormalString(Token* t)
  {
    auto p = this.p;
    assert(*p == '"');
    auto tokenLine = this.lineLoc; // Line info at the start; passed to error().
    t.kind = T!"String";
    auto value = getBuffer();
    auto prev = ++p; // Skip '"'. prev is used to copy chunks to value.
    cchar* prev2; // Points to the first character of a newline to convert.

    while (*p != '"')
      switch (*p)
      {
      case '\\':
        // Copy the text preceding the escape sequence.
        if (prev != p) value ~= slice(prev, p);
        bool isBinary;
        auto c = scanEscapeSequence(p, isBinary);
        if (isascii(c) || isBinary)
          value ~= cast(char)c;
        else
          encodeUTF8(value, c);
        prev = p; // The next chunk starts behind the escape sequence.
        break;
      case '\r':
        prev2 = p;
        if (p[1] == '\n') // Treat "\r\n" as a single newline.
          ++p;
      LconvertNewline:
        value ~= slice(prev, prev2 + 1); // +1 is for '\n'.
        *(value.cur-1) = '\n'; // Convert Newline to '\n'.
        prev = p+1;
        goto case;
      case '\n':
        setLineBegin(++p);
        break;
      case 0, _Z_:
        error(tokenLine, t.start, MID.UnterminatedString);
        goto Lerror;
      default:
        if (!isascii(*p) && isUnicodeNewlineChar(decodeUTF8(p)))
        {
          // p points to the last byte of the newline; it begins 2 bytes earlier.
          prev2 = p - 2;
          goto LconvertNewline;
        }
        ++p;
      }
    assert(*p == '"');

    {
    auto finalString = slice(prev, p);
    if (value.len)
      finalString = ((value ~= finalString), value[]); // Append previous string.
    ++p; // Skip '"'.
    t.strval = lookupString(finalString, scanPostfix(p));
    }
  Lerror: // Also reached after a successful scan.
    t.end = this.p = p;
    setBuffer(value); // Hand the buffer back for reuse.
    return;
  }
1129   /// Scans an escape string literal.
1130   ///
1131   /// $(BNF EscapeStringLiteral := EscapeSequence+ )
1132   void scanEscapeString(Token* t)
1133   {
1134     version(D1)
1135     {
1136     assert(*p == '\\');
1137     auto value = getBuffer();
1138     do
1139     {
1140       bool isBinary;
1141       auto c = scanEscapeSequence(p, isBinary);
1142       if (isascii(c) || isBinary)
1143         value ~= cast(char)c;
1144       else
1145         encodeUTF8(value, c);
1146     } while (*p == '\\');
1147     t.strval = lookupString(value, '\0');
1148     t.kind = T!"String";
1149     t.end = p;
1150     setBuffer(value);
1151     }
1152   }
1154   /// Scans a character literal.
1155   ///
1156   /// $(BNF CharacterLiteral := "'" (EscapeSequence | AnyChar) "'")
1157   void scanCharacter(Token* t)
1158   {
1159     assert(*p == '\'');
1160     t.kind = T!"Character";
1161     switch (*++p)
1162     {
1163     case '\\':
1164       bool notused;
1165       t.dchar_ = scanEscapeSequence(p, notused);
1166       break;
1167     case '\'':
1168       error(t.start, MID.EmptyCharacterLiteral);
1169       break;
1170     default:
1171       if (isEndOfLine(p))
1172         break;
1173       t.dchar_ = isascii(*p) ? *p : decodeUTF8(p);
1174       ++p;
1175     }
1177     if (*p == '\'')
1178       ++p;
1179     else
1180       error(t.start, MID.UnterminatedCharacterLiteral);
1181     t.end = p;
1182   }
1184   /// Scans a raw string literal.
1185   ///
1186   /// $(BNF RawStringLiteral := 'r"' AnyChar* '"' | "`" AnyChar* "`")
  void scanRawString(Token* t)
  {
    // Scan with a local copy of the read pointer; this.p is updated at Lerror.
    auto p = this.p;
    // p must point to the opening delimiter: '`', or the '"' of `r"`.
    assert(*p == '`' || (p-1)[0..2] == `r"`);
    auto tokenLine = this.lineLoc; // Line of the opening delimiter, for errors.
    t.kind = T!"String";
    uint delim = *p; // The closing delimiter is the same character.
    // value only receives text when a newline other than '\n' is converted.
    auto value = getBuffer();
    auto prev = ++p; // Skip the delimiter. prev marks the next chunk to copy.
    cchar* prev2; // First byte of a newline sequence that has to be converted.
    while (*p != delim)
      switch (*p)
      {
      case '\r':
        prev2 = p;
        if (p[1] == '\n')
          ++p; // "\r\n" counts as one newline.
      LconvertNewline:
        // Copy the chunk together with the first byte of the newline sequence
        // and overwrite that byte with '\n'.
        value ~= slice(prev, prev2 + 1);
        *(value.cur-1) = '\n'; // Convert Newline to '\n'.
        prev = p+1; // Continue copying after the last byte of the newline.
        goto case;
      case '\n':
        // A plain '\n' needs no conversion; only the line start is recorded.
        setLineBegin(++p);
        break;
      case 0, _Z_:
        // The source text ended before the closing delimiter was found.
        error(tokenLine, t.start, (delim == '"' ?
          MID.UnterminatedRawString : MID.UnterminatedBackQuoteString));
        goto Lerror;
      default:
        if (!isascii(*p) && isUnicodeNewlineChar(decodeUTF8(p)))
        {
          // NOTE(review): assumes decodeUTF8() leaves p on the last byte of a
          // three-byte newline sequence, so that p - 2 is its first byte.
          prev2 = p - 2;
          goto LconvertNewline;
        }
        ++p;
      }
    assert((*p).In('"', '`'));
    {
    // Use the source slice directly if nothing had to be copied to value.
    auto finalString = slice(prev, p);
    if (value.len)
      finalString = ((value ~= finalString), value[]); // Append previous string.
    ++p; // Skip '"' or '`'.
    t.strval = lookupString(finalString, scanPostfix(p));
    }
  Lerror:
    // Reached on success and on error; t.strval is not set in the error case.
    t.end = this.p = p;
    setBuffer(value);
    return;
  }
1240   /// Scans a hexadecimal string literal.
1241   ///
1242   /// $(BNF HexStringLiteral := 'x"' (HexDigit HexDigit)* '"'
1243   ////HexDigit := [a-fA-F\d])
  void scanHexString(Token* t)
  {
    // Scan with a local copy of the read pointer; this.p is updated at Lerror.
    auto p = this.p;
    assert(p[0..2] == `x"`);
    t.kind = T!"String";
    auto tokenLine = this.lineLoc; // Line of the token start, for error reports.
    auto value = getBuffer(); // Receives one byte per pair of hex digits.
    ubyte h; // Current hex number.
    bool odd; // True if one hex digit has been scanned previously.
    ++p; // Skip 'x'.
    assert(*p == '"');
    // The loop condition advances p; "continue" therefore moves on to the
    // next character as well.
    while (*++p != '"')
      switch (*p)
      {
      case '\r':
        if (p[1] == '\n')
          ++p; // "\r\n" counts as one newline.
        goto case;
      case '\n':
        // Newlines are skipped; only the line start is recorded.
        setLineBegin(p+1);
        continue;
      default:
        dchar c = *p;
        // NOTE(review): hex2val() presumably replaces c with its numeric
        // value when it is a hex digit - c is used as a nibble below.
        if (hex2val(c))
        {
          if (odd)
            value ~= cast(ubyte)(h << 4 | c); // Second digit: emit the byte.
          else
            h = cast(ubyte)c; // First digit: remember the high nibble.
          odd = !odd;
        }
        else if (isspace(c))
          continue; // Skip spaces.
        else if (isEOF(c)) {
          // The source text ended before the closing quote was found.
          error(tokenLine, t.start, MID.UnterminatedHexString);
          goto Lerror;
        }
        else
        {
          // Remember the start of the character; decodeUTF8() moves p.
          auto errorAt = p;
          if (!isascii(c) && isUnicodeNewlineChar(c = decodeUTF8(p)))
            goto case '\n';
          // Any other character is reported, but scanning continues.
          error(errorAt, MID.NonHexCharInHexString, cast(dchar)c);
        }
      }
    // A trailing single digit is reported and not added to the value.
    if (odd)
      error(tokenLine, t.start, MID.OddNumberOfDigitsInHexString);
    ++p; // Skip the closing '"'.
    t.strval = lookupString(value[], scanPostfix(p));
  Lerror:
    // Reached on success and on error; t.strval is not set in the error case.
    t.end = this.p = p;
    setBuffer(value);
    return;
  }
1302   /// Scans a delimited string literal.
1303   ///
1304   /// $(BNF
1305   ////DelimitedStringLiteral := 'q"' OpeningDelim AnyChar* MatchingDelim '"'
1306   ////OpeningDelim  := "[" | "(" | "{" | "&lt;" | Identifier EndOfLine
1307   ////MatchingDelim := "]" | ")" | "}" | "&gt;" | EndOfLine Identifier
1308   ////)
1309   void scanDelimitedString(Token* t)
1310   {
1311   version(D2)
1312   {
1313     auto p = this.p;
1314     assert(p[0..2] == `q"`);
1315     t.kind = T!"String";
1317     auto tokenLine = this.lineLoc;
1319     auto value = getBuffer();
1320     dchar nesting_delim, // '[', '(', '<', '{', or 0 if no nesting delimiter.
1321           closing_delim; // Will be ']', ')', '>', '},
1322                          // the first character of an identifier or
1323                          // any other Unicode/ASCII character.
1324     cstring str_delim; // Identifier delimiter.
1325     uint level = 1; // Counter for nestable delimiters.
1327     ++p; ++p; // Skip q"
1328     auto prev = p;
1329     cchar* prev2;
1330     dchar c = *p;
1331     // Scan the delimiter.
1332     switch (c)
1333     {
1334     case '(':
1335       nesting_delim = c;
1336       closing_delim = ')'; // c + 1
1337       break;
1338     case '[', '<', '{':
1339       nesting_delim = c;
1340       // Get to the closing counterpart. Feature of ASCII table.
1341       closing_delim = c + 2; // ']', '>' or '}'
1342       break;
1343     default:
1344       if (isNewline(p))
1345       {
1346         error(p, MID.DelimiterIsMissing);
1347         goto Lerror;
1348       }
1350       auto idbegin = p;
1351       closing_delim = isascii(c) ? c : decodeUTF8(p);
1353       if (isidbeg(closing_delim) || isUniAlpha(closing_delim))
1354       { // Scan: Identifier Newline
1355         do
1356         { c = *++p; }
1357         while (isident(c) || !isascii(c) && scanUnicodeAlpha(p));
1358         str_delim = slice(idbegin, p); // Scanned identifier delimiter.
1359         if (scanNewline(p))
1360           setLineBegin(p);
1361         else
1362           error(p, MID.NoNewlineAfterIdDelimiter, str_delim);
1363         --p; // Go back one because of "c = *++p;" in main loop.
1364       }
1365     }
1366     assert(closing_delim);
1368     if (isspace(closing_delim))
1369       error(p, MID.DelimiterIsWhitespace);
1371     bool checkStringDelim(cchar* p)
1372     { // Returns true if p points to the closing string delimiter.
1373       assert(str_delim.length != 0, ""~*p);
1374       return this.lineLoc.p is p && // Must be at the beginning of a new line.
1375         this.endX()-p >= str_delim.length && // Check remaining length.
1376         p[0..str_delim.length] == str_delim; // Compare.
1377     }
1379     // Scan the contents of the string.
1380     while (1)
1381       switch (c = *++p)
1382       {
1383       case '\r':
1384         prev2 = p;
1385         if (p[1] == '\n')
1386           ++p;
1387       LconvertNewline:
1388         value ~= slice(prev, prev2 + 1); // +1 is for '\n'.
1389         *(value.cur-1) = '\n'; // Convert Newline to '\n'.
1390         prev = p+1;
1391         goto case;
1392       case '\n':
1393         setLineBegin(p+1);
1394         break;
1395       case 0, _Z_:
1396         error(tokenLine, t.start, MID.UnterminatedDelimitedString);
1397         goto Lerror;
1398       default:
1399         prev2 = p;
1400         if (!isascii(c))
1401         { // Unicode branch.
1402           c = decodeUTF8(p);
1403           if (isUnicodeNewlineChar(c))
1404             goto LconvertNewline;
1405           if (c == closing_delim)
1406             if (str_delim.length)
1407             { // Matched first character of the string delimiter.
1408               if (checkStringDelim(prev2))
1409               {
1410                 p = prev2 + str_delim.length;
1411                 goto Lreturn2;
1412               }
1413             }
1414             else
1415             {
1416               assert(level == 1);
1417               --level;
1418               goto Lreturn;
1419             }
1420         }
1421         else // ASCII branch.
1422           if (c == nesting_delim)
1423             ++level;
1424           else if (c == closing_delim)
1425             if (str_delim.length)
1426             { // Matched first character of the string delimiter.
1427               if (checkStringDelim(p))
1428               {
1429                 p += str_delim.length;
1430                 goto Lreturn2;
1431               }
1432             }
1433             else if (--level == 0)
1434               goto Lreturn;
1435       }
1436   Lreturn: // Character delimiter.
1437     assert(c == closing_delim);
1438     assert(level == 0);
1439     ++p; // Skip closing delimiter.
1440   Lreturn2: // String delimiter.
1441     {
1442     auto finalString = slice(prev, prev2);
1443     if (value.len)
1444       finalString = ((value ~= finalString), value[]); // Append previous string.
1446     char postfix;
1447     if (*p == '"')
1448       postfix = scanPostfix(++p);
1449     else
1450     { // Pass str_delim or encode and pass closing_delim as a string.
1451       if (!str_delim.length)
1452       {
1453         char[] tmp;
1454         encode(tmp, closing_delim);
1455         str_delim = tmp;
1456       }
1457       error(p, MID.ExpectedDblQuoteAfterDelim, str_delim);
1458     }
1459     t.strval = lookupString(finalString, postfix);
1460     }
1461   Lerror:
1462     t.end = this.p = p;
1463     setBuffer(value);
1464   } // version(D2)
1465   }
1467   /// Scans a token string literal.
1468   ///
1469   /// $(BNF TokenStringLiteral := "q{" Token* "}")
  void scanTokenString(Token* t)
  {
  version(D2)
  {
    assert(p[0..2] == `q{`);
    t.kind = T!"String";
    auto tokenLine = this.lineLoc; // Line of the token start, for error reports.
    ++inTokenString; // A guard against changes to 'this.hlinfo'.
    ++p; ++p; // Skip q{
    cchar* str_begin = p, str_end; // Inner string.
    TokenArray innerTokens; // The tokens inside this string.
    innerTokens.cap = 1;
    // Set to true, if '\r', LS, PS, or multiline tokens are encountered.
    bool convertNewlines;
    Token* new_t; // The inner token that was scanned last.
    uint level = 1; // Current nesting level of curly braces.
  Loop:
    while (1)
    {
      // Make room for one more token, then scan directly into that slot.
      if (innerTokens.rem == 0)
        innerTokens.growX1_5();
      scan(new_t = innerTokens.cur++);
      switch (new_t.kind)
      {
      case T!"{":
        ++level;
        break;
      case T!"}":
        // The brace that closes the outermost level ends the token string.
        if (--level == 0)
          break Loop;
        break;
      case T!"String", T!"Comment":
        if (new_t.isMultiline())
          convertNewlines = true;
        break;
      case T!"Newline":
        // Every newline other than a plain '\n' has to be converted.
        if (*new_t.start != '\n')
          convertNewlines = true;
        break;
      case T!"EOF":
        error(tokenLine, t.start, MID.UnterminatedTokenString);
        // Move back to where the EOF token (or its leading whitespace) starts.
        this.p = new_t.ws ? new_t.ws : new_t.start; // Reset.
        break Loop;
      default:
      }
    }
    assert(new_t.kind.In(T!"}", T!"EOF"));
    char postfix;
    if (new_t.kind == T!"EOF")
      str_end = t.end = p;
    else
    {
      str_end = p-1; // Exclude the closing '}' from the string value.
      postfix = scanPostfix(p);
      t.end = p;
    }
    *new_t = Token.init; // Terminate with a "0-token".
    auto value = slice(str_begin, str_end);
    // Convert newlines to '\n'.
    if (convertNewlines)
    { // Copy the value and convert the newlines.
      auto tmp = getBuffer();
      // The converted text is never longer than the original text.
      tmp.len = value.length;
      auto q = str_begin; // Reader.
      auto s = tmp.ptr; // Writer.
      for (; q < str_end; ++q)
        switch (*q)
        {
        case '\r':
          if (q[1] == '\n')
            ++q; // "\r\n" counts as one newline.
          goto case;
        case '\n':
          assert(isNewlineEnd(q));
          *s++ = '\n'; // Convert Newline to '\n'.
          break;
        default:
          if (isUnicodeNewline(q))
          {
            // Skip to the last byte of the three-byte newline sequence.
            ++q; ++q;
            goto case '\n';
          }
          *s++ = *q; // Copy current character.
        }
      tmp.len = s - tmp.ptr; // Shrink to the number of bytes written.
      value = tmp[];
      setBuffer(tmp);
    }
    // Unlike the other string scanners, the StringValue is built by hand
    // here because it also has to carry the inner tokens.
    auto strval = new_!(StringValue);
    strval.str = lookupString(cast(cbinstr)value);
    strval.pf = postfix;
    strval.tokens = innerTokens.ptr;
    t.strval = strval;
    --inTokenString;
  } // version(D2)
  }
1575   /// Scans an escape sequence.
1576   ///
1577   /// $(BNF
1578   ////EscapeSequence := "\\" (BinaryEsc | UnicodeEsc | CEsc | HTMLEsc)
1579   ////BinaryEsc := Octal{1,3} | "x" Hex{2}
1580   ////UnicodeEsc := "u" Hex{4} | "U" Hex{8}
1581   ////CEsc := "'" | '"' | "?" | "\\" | "a" | "b" | "f" | "n" | "r" | "t" | "v"
1582   ////HTMLEsc := "&" EntityName ";"
1583   ////EntityName := [a-zA-Z] [a-zA-Z\d]*
1584   ////)
1585   /// Params:
1586   ///   ref_p = Used to scan the sequence.
1587   ///   isBinary = Set to true for octal and hexadecimal escapes.
1588   /// Returns: The escape value.
  dchar scanEscapeSequence(ref cchar* ref_p, out bool isBinary)
  out(result)
  { assert(isValidChar(result)); }
  body
  {
    // Scan with a local pointer. ref_p keeps pointing to the backslash until
    // the end, so that errors can be reported at the start of the sequence.
    auto p = ref_p;
    assert(*p == '\\');
    // Used for error reporting.
    MID mid;
    cstring err_arg;
    ++p; // Skip '\\'.
    dchar c = char2ev(*p); // Table lookup.
    if (c)
    { // A simple one-character escape; c is its value.
      ++p;
      goto Lreturn;
    }
    switch (*p)
    {
    // Number of hex digit pairs to scan: 1 for \x, 2 for \u and 4 for \U.
    uint loopCounter;
    case 'x':
      isBinary = true;
      loopCounter = 1;
    case_Unicode:
      assert(c == 0 && loopCounter.In(1, 2, 4));
      mid = MID.InsufficientHexDigits;
      while (loopCounter--)
      { // Decode two hex digits.
        dchar x = *++p;
        if (!hex2val(x))
          goto Lerror; // Not a hexdigit.
        c = c << 4 | x;
        x = *++p;
        if (!hex2val(x))
          goto Lerror;
        c = c << 4 | x;
      }
      ++p; // Skip the last hex digit.
      if (!isValidChar(c))
      {
        mid = MID.InvalidUnicodeEscapeSequence;
        goto Lerror;
      }
      break;
    case 'u':
      loopCounter = 2;
      goto case_Unicode;
    case 'U':
      loopCounter = 4;
      goto case_Unicode;
    default:
      // A character below '0' wraps around to a large unsigned value and
      // therefore fails the "x < 8" test as well.
      size_t x = *p - '0';
      if (x < 8)
      { // Octal sequence.
        isBinary = true;
        assert(c == 0);
        c = x;
        // Up to two more octal digits may follow.
        if ((x = *++p - '0') >= 8)
          break;
        c = c * 8 + x;
        if ((x = *++p - '0') >= 8)
          break;
        c = c * 8 + x;
        ++p;
        // Three octal digits can exceed the range of a byte.
        if (c <= 0xFF)
          break;
        mid = MID.InvalidOctalEscapeSequence;
      }
      else if (*p == '&')
      { // Named character entity: '&' name ';'
        if (isalpha(*++p))
        {
          auto begin = p;
          while (isalnum(*++p))
          {}
          if (*p == ';')
          { // Pass entity excluding '&' and ';'.
            c = entity2Unicode(slice(begin, p));
            ++p; // Skip ;
            if (c)
              goto Lreturn; // Return valid escape value.
            else
              mid = MID.UndefinedHTMLEntity;
          }
          else
            mid = MID.UnterminatedHTMLEntity;
        }
        else
          mid = MID.InvalidBeginHTMLEntity;
      }
      else if (isEndOfLine(p)) {
        // A backslash at the end of a line or of the source text.
        // p is not advanced, so the caller sees the end of line itself.
        mid = MID.UndefinedEscapeSequence;
        err_arg = isEOF(*p) ? `\EOF` : `\NewLine`;
      }
      else
      { // Any other character: report it together with the backslash.
        auto tmp = "\\".dup;
        // TODO: check for non-printable character?
        encode(tmp, isascii(*p) ? *p : decodeUTF8(p));
        err_arg = tmp;
        ++p;
        mid = MID.UndefinedEscapeSequence;
      }
      goto Lerror;
    }
  Lreturn:
    ref_p = p;
    return c;
  Lerror:
    // Use the scanned text as the message argument if none was set above.
    if (!err_arg.length)
      err_arg = slice(ref_p, p);
    error(ref_p, mid, err_arg);
    ref_p = p; // Is at the beginning of the sequence. Update now.
    return REPLACEMENT_CHAR; // Error: return replacement character.
  }
1711   /// Scans a number literal.
1712   ///
1713   /// $(BNF
1714   ////IntegerLiteral := (Dec | Hex | Bin | Oct) Suffix?
1715   ////Dec := "0" | [1-9] [\d_]*
1716   ////Hex := "0" [xX] "_"* HexDigits
1717   ////Bin := "0" [bB] "_"* [01] [01_]*
1718   ////Oct := "0" [0-7_]*
1719   ////Suffix := "L" [uU]? | [uU] "L"?
1720   ////)
1721   /// Invalid: "0b_", "0x_", "._" etc.
1722   void scanNumber(Token* t)
1723   {
1724     assert(isdigit(*p));
1725     auto p = this.p;
1726     ulong ulong_; // The integer value.
1727     bool overflow; // True if an overflow was detected.
1728     bool isDecimal; // True for Dec literals.
1729     bool hasDecimalDigits; // To check for 8s and 9s in octal numbers.
1730     size_t digits; // Used to detect overflow in hex/bin numbers.
1731     size_t x; // Current digit value.
1733     bool isfloat(char c)
1734     { // True if the decimal point '.' is not followed by:
1735       return c != '.' && !isidbeg(c) && isascii(c);
1736     }
1738     if (*p != '0')
1739       goto LscanInteger;
1740     ++p; // Skip zero.
1741     // Check for xX bB ...
1742     switch (*p)
1743     {
1744     case 'x','X':
1745       goto LscanHex;
1746     case 'b','B':
1747       goto LscanBinary;
1748     case 'L':
1749       if (p[1] == 'i')
1750         goto LscanFloat; // 0Li
1751       break; // 0L
1752     case '.':
1753       if (!isfloat(p[1]))
1754         break;
1755       goto LscanFloat; // 0.[0-9]
1756     case 'i','f','F', // Imaginary and float literal suffixes.
1757          'e', 'E':    // Float exponent.
1758       goto LscanFloat;
1759     default:
1760       if (*p == '_')
1761         goto LscanOctal; // 0_
1762       else if ((x = *p - '0') < 10)
1763         if (x > 7)
1764           goto Loctal_hasDecimalDigits; // 08 or 09
1765         else
1766           goto Loctal_scannedFirstDigit; // 0[0-7]
1767     }
1769     // Number 0
1770     assert(p[-1] == '0' && !isdigi_(*p) && ulong_ == 0);
1771     isDecimal = true;
1772     goto Lfinalize;
1774   LscanInteger:
1775     assert(*p != '0' && isdigit(*p));
1776     isDecimal = true;
1777     for (; 1; ++p)
1778       if ((x = *p - '0') < 10)
1779       {
1780         if (ulong_ < ulong.max/10 || (ulong_ == ulong.max/10 && x < 6))
1781           ulong_ = ulong_ * 10 + x;
1782         else
1783         { // Overflow: skip following digits.
1784           overflow = true;
1785           while (isdigit(*++p))
1786           {}
1787           break;
1788         }
1789       }
1790       else if (*p != '_')
1791         break;
1793     // The number could be a float, so check overflow below.
1794     switch (*p)
1795     {
1796     case '.':
1797       if (isfloat(p[1]))
1798         goto LscanFloat;
1799       break;
1800     case 'L':
1801       if (p[1] != 'i')
1802         break;
1803       goto LscanFloat;
1804     case 'i', 'f', 'F', 'e', 'E':
1805       goto LscanFloat;
1806     default:
1807     }
1809     if (overflow)
1810       error(t.start, MID.OverflowDecimalNumber);
1812     assert(isdigi_(p[-1]) && !isdigi_(*p));
1813     goto Lfinalize;
1815   LscanHex:
1816     assert(digits == 0);
1817     assert((*p).In('x', 'X'));
1818     while (1)
1819     {
1820       x = *++p;
1821       if (hex2val(x))
1822       {
1823         ulong_ = ulong_ << 4 | x;
1824         ++digits;
1825       }
1826       else if (*p != '_')
1827         break;
1828     }
1830     assert((ishexa_(p[-1]) || p[-1].In('x', 'X')) && !ishexa_(*p));
1832     switch (*p)
1833     {
1834     case '.':
1835       if (!isfloat(p[1]))
1836         break;
1837       goto case;
1838     case 'p', 'P':
1839       this.p = p;
1840       return scanHexFloat(t);
1841     default:
1842     }
1844     if (digits == 0 || digits > 16)
1845       error(t.start,
1846         digits == 0 ? MID.NoDigitsInHexNumber : MID.OverflowHexNumber);
1848     goto Lfinalize;
1850   LscanBinary:
1851     assert(digits == 0);
1852     assert((*p).In('b', 'B'));
1853     while (1)
1854       if ((x = *++p - '0') < 2)
1855       {
1856         ++digits;
1857         ulong_ = ulong_ * 2 + x;
1858       }
1859       else if (*p != '_')
1860         break;
1862     if (digits == 0 || digits > 64)
1863       error(t.start,
1864         digits == 0 ? MID.NoDigitsInBinNumber : MID.OverflowBinaryNumber);
1866     assert(p[-1].In('0', '1', '_', 'b', 'B'), p[-1] ~ "");
1867     assert(!(*p).In('0', '1', '_'));
1868     goto Lfinalize;
1870   LscanOctal:
1871     assert(*p == '_');
1872     while (1)
1873       if ((x = *++p - '0') < 8)
1874       {
1875         if (ulong_ < ulong.max/2 || (ulong_ == ulong.max/2 && x < 2))
1876         Loctal_scannedFirstDigit:
1877           ulong_ = ulong_ * 8 + x;
1878         else
1879         { // Overflow: skip following digits.
1880           overflow = true;
1881           while (isoctal(*++p))
1882           {}
1883           break;
1884         }
1885       }
1886       else if (*p != '_')
1887         break;
1889     if (isdigit(*p))
1890     {
1891     Loctal_hasDecimalDigits:
1892       hasDecimalDigits = true;
1893       while (isdigit(*++p))
1894       {}
1895     }
1897     // The number could be a float, so check errors below.
1898     switch (*p)
1899     {
1900     case '.':
1901       if (isfloat(p[1]))
1902         goto LscanFloat;
1903       break;
1904     case 'L':
1905       if (p[1] != 'i')
1906         break;
1907       goto LscanFloat;
1908     case 'i', 'f', 'F', 'e', 'E':
1909       goto LscanFloat;
1910     default:
1911     }
1913     version(D2)
1914     {
1915     if (ulong_ >= 8 || hasDecimalDigits)
1916       error(t.start, MID.OctalNumbersDeprecated);
1917     }
1918     else
1919     {
1920     if (hasDecimalDigits)
1921       error(t.start, MID.OctalNumberHasDecimals);
1922     if (overflow)
1923       error(t.start, MID.OverflowOctalNumber);
1924     }
1925     //goto Lfinalize;
1927   Lfinalize:
1928     {
1929     enum Suffix
1930     {
1931       None     = 0,
1932       Unsigned = 1,
1933       Long     = 2
1934     }
1936     // Scan optional suffix: L, Lu, LU, u, uL, U or UL.
1937     Suffix suffix;
1938   Loop:
1939     while (1)
1940       switch (*p)
1941       {
1942       case 'L':
1943         if (suffix & Suffix.Long)
1944           break Loop;
1945         suffix |= Suffix.Long;
1946         ++p;
1947         continue;
1948       case 'u', 'U':
1949         if (suffix & Suffix.Unsigned)
1950           break Loop;
1951         suffix |= Suffix.Unsigned;
1952         ++p;
1953         continue;
1954       default:
1955         break Loop;
1956       }
1958     // Determine type of Integer.
1959     TOK kind;
1960     switch (suffix)
1961     {
1962     case Suffix.None:
1963       if (ulong_ & 0x8000_0000_0000_0000)
1964       {
1965         if (isDecimal)
1966           error(t.start, MID.OverflowDecimalSign);
1967         kind = T!"UInt64";
1968       }
1969       else if (ulong_ & 0xFFFF_FFFF_0000_0000)
1970         kind = T!"Int64";
1971       else if (ulong_ & 0x8000_0000)
1972         kind = isDecimal ? T!"Int64" : T!"UInt32";
1973       else
1974         kind = T!"Int32";
1975       break;
1976     case Suffix.Unsigned:
1977       if (ulong_ & 0xFFFF_FFFF_0000_0000)
1978         kind = T!"UInt64";
1979       else
1980         kind = T!"UInt32";
1981       break;
1982     case Suffix.Long:
1983       if (ulong_ & 0x8000_0000_0000_0000)
1984       {
1985         if (isDecimal)
1986           error(t.start, MID.OverflowDecimalSign);
1987         kind = T!"UInt64";
1988       }
1989       else
1990         kind = T!"Int64";
1991       break;
1992     case Suffix.Unsigned | Suffix.Long:
1993       kind = T!"UInt64";
1994       break;
1995     default:
1996       assert(0);
1997     }
1999     t.kind = kind;
2000     if (kind == T!"Int64" || kind == T!"UInt64")
2001     {
2002       version(X86_64)
2003       t.intval.ulong_ = ulong_;
2004       else
2005       t.intval = tables.lookupUlong(ulong_);
2006     }
2007     else
2008       t.uint_ = cast(uint)ulong_;
2009     t.end = this.p = p;
2010     return;
2011     }
2013   LscanFloat:
2014     this.p = p;
2015     scanFloat(t);
2016     return;
2017   }
2019   /// Returns a zero-terminated copy of the string where all
2020   /// underscores are removed.
2021   static char[] copySansUnderscores(cchar* begin, cchar* end)
2022   {
2023     auto s = String(begin, end + 1).dup;
2024     s[Neg(1)] = 0;
2025     return s.sub('_', "")[];
2026   }
2028   /// Scans a floating point number literal.
2029   ///
2030   /// $(BNF
2031   ////FloatLiteral := Float [fFL]? i?
2032   ////Float        := DecFloat | HexFloat
2033   ////DecFloat     := (DecDigits "." "_"* DecDigits? DecExponent?) |
  ////                ("." DecDigits DecExponent?) |
2035   ////                (DecDigits DecExponent)
2036   ////DecExponent  := [eE] [+-]? DecDigits
2037   ////DecDigits    := \d [\d_]*
2038   ////)
2039   void scanFloat(Token* t)
2040   {
2041     auto p = this.p;
2042     if (*p == '.')
2043     {
2044       assert(p[1] != '.');
2045       // This function was called by scan() or scanNumber().
2046       while (isdigi_(*++p))
2047       {}
2048     }
2049     else // This function was called by scanNumber().
2050       assert((*p).In('i', 'f', 'F', 'e', 'E') || p[0..2] == "Li");
2052     // Scan exponent.
2053     if (*p == 'e' || *p == 'E')
2054     {
2055       ++p;
2056       if (*p == '-' || *p == '+')
2057         ++p;
2058       if (isdigit(*p))
2059         while (isdigi_(*++p))
2060         {}
2061       else
2062         error(p, MID.FloatExpMustStartWithDigit);
2063     }
2065     this.p = p;
2066     finalizeFloat(t, copySansUnderscores(t.start, p));
2067   }
2069   /// Scans a hexadecimal floating point number literal.
2070   /// $(BNF
2071   ////HexFloat := "0" [xX] (HexDigits? "." HexDigits | HexDigits) HexExponent
2072   ////HexExponent := [pP] [+-]? DecDigits
2073   ////HexDigits := [a-fA-F\d] [a-fA-F\d_]*
2074   ////)
2075   void scanHexFloat(Token* t)
2076   {
2077     auto p = this.p;
2078     assert((*p).In('.', 'p', 'P'));
2079     MID mid = MID.HexFloatExponentRequired;
2080     if (*p == '.')
2081       while (ishexa_(*++p))
2082       {}
2083     // Decimal exponent is required.
2084     if (*p != 'p' && *p != 'P')
2085       goto Lerror;
2086     // Scan exponent
2087     assert((*p).In('p', 'P'));
2088     ++p;
2089     if (*p == '+' || *p == '-')
2090       ++p;
2091     if (!isdigit(*p))
2092     {
2093       mid = MID.HexFloatExpMustStartWithDigit;
2094       goto Lerror;
2095     }
2096     while (isdigi_(*++p))
2097     {}
2099     this.p = p;
2100     finalizeFloat(t, copySansUnderscores(t.start, p));
2101     return;
2102   Lerror:
2103     t.kind = T!"Float32";
2104     t.end = this.p = p;
2105     error(p, mid);
2106   }
2108   /// Sets the value of the token.
2109   /// Params:
2110   ///   t = Receives the value.
2111   ///   float_string = The well-formed float number string.
2112   void finalizeFloat(Token* t, cstring float_string)
2113   {
2114     auto p = this.p;
2115     assert(float_string.length && float_string[$-1] == 0);
2116     // Finally check suffixes.
2117     TOK kind = void;
2118     if (*p == 'f' || *p == 'F')
2119       ++p, kind = T!"Float32";
2120     else if (*p == 'L')
2121       ++p, kind = T!"Float80";
2122     else
2123       kind = T!"Float64";
2125     if (*p == 'i')
2126     {
2127       ++p;
2128       kind += 3; // Switch to imaginary counterpart.
2129       assert(kind.In(T!"IFloat32", T!"IFloat64", T!"IFloat80"));
2130     }
2131     // TODO: test for overflow/underflow according to target platform.
2132     //       CompilationContext must be passed to Lexer for this.
2133     auto f = lookupFloat(float_string);
2134     if (f.isPInf())
2135       error(t.start, MID.OverflowFloatNumber);
2136     // else if (f.isNInf())
2137       // error(t.start, MID.UnderflowFloatNumber);
2138     // else if (f.isNaN())
2139       // error(t.start, MID.NaNFloat);
2140     t.mpfloat = f;
2141     t.kind = kind;
2142     t.end = this.p = p;
2143     return;
2144   }
2146   /// Scans a special token sequence.
2147   ///
2148   /// $(BNF SpecialTokenSequence := "#line" Integer Filespec? EndOfLine)
  void scanSpecialTokenSequence(Token* t)
  {
    // Scan with a local copy of the read pointer; this.p is set at the end.
    auto p = this.p;
    assert(*p == '#');
    // Allocated up front, so that the token gets a value even on error.
    auto hlval = new_!(Token.HashLineValue);
    MID mid; // The error message to report at Lerror.
    cchar* errorAtColumn = p; // Where the error is reported.
    cchar* tokenEnd = ++p; // Where the token ends; moved forward while scanning.
    // Compare the next four characters at once.
    // NOTE(review): chars_line is presumably "line" packed into a uint.
    if (*cast(uint*)p != chars_line)
    {
      mid = MID.ExpectedIdentifierSTLine;
      goto Lerror;
    }
    { // Start of scanning code block.
    p += 4; // Skip "line".
    tokenEnd = p;
    // TODO: #line58"path/file" is legal. Require spaces?
    //       State.Space could be used for that purpose.
    enum State
    { /+Space,+/ Integer, OptionalFilespec, End }
    State state = State.Integer;
    while (!isEndOfLine(p))
    {
      if (isspace(*p))
      {} // Whitespace is skipped by the "++p" at the end of the loop.
      else if (state == State.Integer)
      {
        if (!isdigit(*p))
        {
          errorAtColumn = p;
          mid = MID.ExpectedIntegerAfterSTLine;
          goto Lerror;
        }
        // Let scan() produce the number token; it works on this.p.
        auto newtok = new_!(Token);
        hlval.lineNum = newtok;
        this.p = p;
        scan(newtok);
        tokenEnd = p = this.p;
        // Only 32 bit integer tokens are accepted as a line number.
        if (newtok.kind != T!"Int32" && newtok.kind != T!"UInt32")
        {
          errorAtColumn = newtok.start;
          mid = MID.ExpectedIntegerAfterSTLine;
          goto Lerror;
        }
        state = State.OptionalFilespec;
        continue; // p is already behind the number.
      }
      else if (state == State.OptionalFilespec && *p == '"')
      {
        auto fs = hlval.filespec = new_!(Token);
        fs.start = p;
        fs.kind = T!"Filespec";
        // Skip until closing '"'.
        while (*++p != '"' && !isEndOfLine(p))
          isascii(*p) || decodeUTF8(p);
        if (*p != '"')
        { // Error.
          errorAtColumn = fs.start;
          mid = MID.UnterminatedFilespec;
          fs.end = p;
          tokenEnd = p;
          goto Lerror;
        }
        auto str = slice(fs.start + 1, p); // Get string excluding "".
        fs.strval = lookupString(str, '\0');
        fs.end = tokenEnd = ++p;
        state = State.End;
        continue; // p is already behind the closing '"'.
      }
      else/+ if (state == State.End)+/
      { // Anything else before the end of the line is an error.
        errorAtColumn = tokenEnd;
        mid = MID.UnterminatedSpecialToken;
        goto Lerror;
      }
      ++p;
    }
    assert(isEndOfLine(p));
    // The line ended before a line number was found.
    if (state == State.Integer)
    {
      errorAtColumn = p;
      mid = MID.ExpectedIntegerAfterSTLine;
      goto Lerror;
    }
    } // End of scanning code block.
    // Evaluate #line only when not in token string.
    if (!inTokenString && hlval.lineNum)
    {
      if (!hlinfo)
      { // First #line: start out with the path of the source text.
        hlinfo = new_!(Token.HashLineInfo);
        hlinfo.path = srcText.filePath;
      }
      hlinfo.setLineNum(this.lineNum, hlval.lineNum.sizet_);
      if (hlval.filespec)
        hlinfo.path = cast(cstring)hlval.filespec.strval.str;
    }
    if (0) // Only issue an error if jumped here.
    Lerror:
      error(errorAtColumn, mid);
    // Reached on success and on error.
    t.kind = TOK.HashLine;
    t.hlval = hlval;
    t.end = this.p = tokenEnd;
    return;
  }
2266   /// Returns the error line number.
2267   size_t errorLineNumber(size_t lineNum)
2268   {
2269     if (hlinfo)
2270       lineNum -= hlinfo.lineNum;
2271     return lineNum;
2272   }
2274   /// Returns the file path for error messages.
2275   cstring errorFilePath()
2276   {
2277     return hlinfo ? hlinfo.path : srcText.filePath;
2278   }
2280   /// Forwards error parameters.
2281   void error(cchar* columnPos, MID mid, ...)
2282   {
2283     error(_arguments, _argptr, this.lineLoc, columnPos, diag.bundle.msg(mid));
2284   }
2286   /// ditto
2287   void error(LineLoc line, cchar* columnPos, MID mid, ...)
2288   {
2289     error(_arguments, _argptr, line, columnPos, diag.bundle.msg(mid));
2290   }
2292   /// Creates an error report and appends it to a list.
2293   /// Params:
2294   ///   line = The line number and pointer to the first character of a line.
2295   ///   columnPos = Points to the character where the error is located.
2296   ///   msg = The error message.
2297   void error(TypeInfo[] _arguments, va_list _argptr,
2298     LineLoc line, cchar* columnPos, cstring msg)
2299   {
2300     line.n = this.errorLineNumber(line.n);
2301     auto errorPath = errorFilePath();
2302     auto location = new Location(errorPath, line.n, line.p, columnPos);
2303     msg = diag.format(_arguments, _argptr, msg);
2304     auto error = new LexerError(location, msg);
2305     errors ~= error;
2306     diag ~= error;
2307   }
2309   /// Returns true if the current character to be decoded is
2310   /// a Unicode alpha character.
2311   /// Params:
2312   ///   ref_p = Is set to the last trail byte if true is returned.
2313   static bool scanUnicodeAlpha(ref cchar* ref_p)
2314   {
2315     auto p = ref_p;
2316     assert(!isascii(*p),
2317       "check for ASCII char before calling scanUnicodeAlpha().");
2318     dchar d = *p;
2319     ++p; // Move to second byte.
2320     // Error if second byte is not a trail byte.
2321     if (!isTrailByte(*p))
2322       return false;
2323     // Check for overlong sequences.
2324     if (d.In(0xE0, 0xF0, 0xF8, 0xFC) && (*p & d) == 0x80 ||
2325         (d & 0xFE) == 0xC0) // 1100000x
2326       return false;
2327     const string checkNextByte = "if (!isTrailByte(*++p))"
2328                                  "  return false;";
2329     const string appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";
2330     // Decode
2331     if ((d & 0b1110_0000) == 0b1100_0000)
2332     {
2333       d &= 0b0001_1111;
2334       mixin(appendSixBits);
2335     }
2336     else if ((d & 0b1111_0000) == 0b1110_0000)
2337     {
2338       d &= 0b0000_1111;
2339       mixin(appendSixBits ~
2340             checkNextByte ~ appendSixBits);
2341     }
2342     else if ((d & 0b1111_1000) == 0b1111_0000)
2343     {
2344       d &= 0b0000_0111;
2345       mixin(appendSixBits ~
2346             checkNextByte ~ appendSixBits ~
2347             checkNextByte ~ appendSixBits);
2348     }
2349     else
2350       return false;
2352     assert(isTrailByte(*p));
2353     if (!isValidChar(d) || !isUniAlpha(d))
2354       return false;
2355     // Only advance pointer if this is a Unicode alpha character.
2356     ref_p = p;
2357     return true;
2358   }
2360   /// Decodes the next UTF-8 sequence.
2361   ///
2362   /// Params:
2363   ///   ref_p = Set to the last trail byte.
2364   dchar decodeUTF8(ref cchar* ref_p)
2365   {
2366     auto p = ref_p;
2367     assert(!isascii(*p), "check for ASCII char before calling decodeUTF8().");
2368     dchar d = *p;
2370     ++p; // Move to second byte.
2371     // Error if second byte is not a trail byte.
2372     if (!isTrailByte(*p))
2373       goto Lerror2;
2375     // Check for overlong sequences.
2376     if (d.In(0xE0, 0xF0, 0xF8, 0xFC) && (*p & d) == 0x80 ||
2377         (d & 0xFE) == 0xC0) // 1100000x
2378       goto Lerror;
2380     enum checkNextByte = "if (!isTrailByte(*++p))"
2381                                  "  goto Lerror2;";
2382     enum appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";
2384     // See how many bytes need to be decoded.
2385     if ((d & 0b1110_0000) == 0b1100_0000)
2386     { // 110xxxxx 10xxxxxx
2387       d &= 0b0001_1111;
2388       goto L2Bytes;
2389     }
2390     else if ((d & 0b1111_0000) == 0b1110_0000)
2391     { // 1110xxxx 10xxxxxx 10xxxxxx
2392       d &= 0b0000_1111;
2393       goto L3Bytes;
2394     }
2395     else if ((d & 0b1111_1000) == 0b1111_0000)
2396     { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2397       d &= 0b0000_0111;
2398       goto L4Bytes;
2399     }
2400     else
2401       // 5 and 6 byte UTF-8 sequences are not allowed yet.
2402       // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2403       // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2404       goto Lerror;
2406     // Decode the bytes now.
2407   L4Bytes:
2408     mixin(appendSixBits);
2409     mixin(checkNextByte);
2410   L3Bytes:
2411     mixin(appendSixBits);
2412     mixin(checkNextByte);
2413   L2Bytes:
2414     mixin(appendSixBits);
2416     assert(isTrailByte(*p));
2418     if (!isValidChar(d))
2419     {
2420     Lerror:
2421       // Three cases:
2422       // *) the UTF-8 sequence was successfully decoded but the resulting
2423       //    character is invalid.
2424       //    p points to last trail byte in the sequence.
2425       // *) the UTF-8 sequence is overlong.
2426       //    p points to second byte in the sequence.
2427       // *) the UTF-8 sequence has more than 4 bytes or starts with
2428       //    a trail byte.
2429       //    p points to second byte in the sequence.
2430       assert(isTrailByte(*p));
2431       // Move to next ASCII character or lead byte of a UTF-8 sequence.
2432       while (*p && !isValidLead(*p))
2433         ++p;
2434       --p;
2435       assert(!isTrailByte(p[1]) && p < this.endX());
2436     Lerror2:
2437       d = REPLACEMENT_CHAR;
2438       error(ref_p, MID.InvalidUTF8Sequence, formatBytes(ref_p, p));
2439     }
2440     // Advance the pointer and return.
2441     ref_p = p;
2442     return d;
2443   }
2445   /// Encodes the character d and appends it to str.
2446   static void encodeUTF8(ref CharArray str, dchar d)
2447   {
2448     assert(!isascii(d), "check for ASCII char before calling encodeUTF8().");
2449     assert(isValidChar(d), "cannot encode invalid char in encodeUTF8().");
2451     auto count = d < 0x800 ? 2 : (d < 0x10000 ? 3 : 4);
2452     if (count > str.rem) // Not enough space?
2453       str.rem = count;
2454     auto p = str.cur;
2455     str.cur += count;
2456     if (d < 0x800)
2457     {
2458       p[0] = 0xC0 | cast(char)(d >> 6);
2459       p[1] = 0x80 | (d & 0x3F);
2460     }
2461     else if (d < 0x10000)
2462     {
2463       p[0] = 0xE0 | cast(char)(d >> 12);
2464       p[1] = 0x80 | ((d >> 6) & 0x3F);
2465       p[2] = 0x80 | (d & 0x3F);
2466     }
2467     else if (d < 0x200000)
2468     {
2469       p[0] = 0xF0 | (d >> 18);
2470       p[1] = 0x80 | ((d >> 12) & 0x3F);
2471       p[2] = 0x80 | ((d >> 6) & 0x3F);
2472       p[3] = 0x80 | (d & 0x3F);
2473     }
2474     else
2475      assert(0);
2476   }
2478   /// Formats the bytes between start and end (excluding end.)
2479   /// Returns: e.g.: "abc" -> "\x61\x62\x63"
2480   static cstring formatBytes(cchar* start, cchar* end)
2481   {
2482     const formatLen = 4; // `\xXX`.length
2483     const H = "0123456789ABCDEF"; // Hex numerals.
2484     auto strLen = end-start;
2485     char[] result = new char[strLen*formatLen]; // Allocate space.
2486     char* p = result.ptr;
2487     foreach (c; start[0..strLen])
2488       (*p++ = '\\'), (*p++ = 'x'), (*p++ = H[c>>4]), (*p++ = H[c&0x0F]);
2489     assert(p is result.ptr+result.length);
2490     return result;
2491   }
2493   /// Searches for an invalid UTF-8 sequence in str.
2494   /// Returns: a formatted string of the invalid sequence (e.g. "\xC0\x80").
2495   static cstring findInvalidUTF8Sequence(cbinstr bstr)
2496   {
2497     auto str = cast(cstring)bstr;
2498     auto p = str.ptr, end = p + str.length;
2499     while (p < end)
2500       if (decode(p, end) == ERROR_CHAR)
2501       {
2502         auto begin = p;
2503         // Skip trail-bytes.
2504         while (++p < end && !isValidLead(*p))
2505         {}
2506         return Lexer.formatBytes(begin, p);
2507       }
2508     assert(p == end);
2509     return null;
2510   }
2511 } // End of Lexer
/// Tests the lexer with a list of tokens.
///
/// The texts of all pairs are joined into one source text, which is then
/// scanned. Every error of the lexer is printed, and the text of each
/// scanned token is compared with the text of the corresponding pair.
/// The function may be called repeatedly: the static tables are not modified.
void testLexer()
{
  scope msg = new UnittestMsg("Testing class Lexer.");
  struct Pair
  {
    string tokenText;
    TOK kind;
  }
  static Pair[] pairs = [
    {"#!äöüß",  TOK.Shebang},       {"\n",      TOK.Newline},
    {"//çay",   TOK.Comment},       {"\n",      TOK.Newline},
                                    {"&",       TOK.Amp},
    {"/*çağ*/", TOK.Comment},       {"&&",      TOK.Amp2},
    {"/+çak+/", TOK.Comment},       {"&=",      TOK.AmpEql},
    {">",       TOK.Greater},       {"+",       TOK.Plus},
    {">=",      TOK.GreaterEql},    {"++",      TOK.Plus2},
    {">>",      TOK.Greater2},      {"+=",      TOK.PlusEql},
    {">>=",     TOK.Greater2Eql},   {"-",       TOK.Minus},
    {">>>",     TOK.Greater3},      {"--",      TOK.Minus2},
    {">>>=",    TOK.Greater3Eql},   {"-=",      TOK.MinusEql},
    {"<",       TOK.Less},          {"=",       TOK.Equal},
    {"<=",      TOK.LessEql},       {"==",      TOK.Equal2},
    {"<>",      TOK.LorG},          {"~",       TOK.Tilde},
    {"<>=",     TOK.LorEorG},       {"~=",      TOK.TildeEql},
    {"<<",      TOK.Less2},         {"*",       TOK.Star},
    {"<<=",     TOK.Less2Eql},      {"*=",      TOK.StarEql},
    {"!",       TOK.Exclaim},       {"/",       TOK.Slash},
    {"!=",      TOK.ExclaimEql},    {"/=",      TOK.SlashEql},
    {"!<",      TOK.UorGorE},       {"^",       TOK.Caret},
    {"!>",      TOK.UorLorE},       {"^=",      TOK.CaretEql},
    {"!<=",     TOK.UorG},          {"%",       TOK.Percent},
    {"!>=",     TOK.UorL},          {"%=",      TOK.PercentEql},
    {"!<>",     TOK.UorE},          {"(",       TOK.LParen},
    {"!<>=",    TOK.Unordered},     {")",       TOK.RParen},
    {".",       TOK.Dot},           {"[",       TOK.LBracket},
    {"..",      TOK.Dot2},          {"]",       TOK.RBracket},
    {"...",     TOK.Dot3},          {"{",       TOK.LBrace},
    {"|",       TOK.Pipe},          {"}",       TOK.RBrace},
    {"||",      TOK.Pipe2},         {":",       TOK.Colon},
    {"|=",      TOK.PipeEql},       {";",       TOK.Semicolon},
    {"?",       TOK.Question},      {",",       TOK.Comma},
    {"$",       TOK.Dollar},        {"cam",     TOK.Identifier},
    {"çay",     TOK.Identifier},    {".0",      TOK.Float64},
    {"0",       TOK.Int32},         {"\n",      TOK.Newline},
    {"\r",      TOK.Newline},       {"\r\n",    TOK.Newline},
    {"\u2028",  TOK.Newline},       {"\u2029",  TOK.Newline},
    {"'c'",     TOK.Character},     {`'\''`,    TOK.Character},
    {`"dblq"`,  TOK.String},        {"`raw`",   TOK.String},
    {`r"aw"`,   TOK.String},        {`x"0123456789abcdef"`, TOK.String},
  ];

  version(D2)
  {
  static Pair[] pairs2 = [
    {"@",       TOK.At},
    {"^^",      TOK.Caret2},
    {"^^=",     TOK.Caret2Eql},
    {"=>",      TOK.EqlGreater},
    {"q\"ⱷ\n\nⱷ\"", TOK.String},    {`q"(())"`, TOK.String},
    {`q"{{}}"`,     TOK.String},    {`q"[[]]"`, TOK.String},
    {`q"<<>>"`,     TOK.String},    {`q"/__/"`, TOK.String},
    {`q"∆⟵✻⟶∆"`, TOK.String},    {`q"\⣯⣻\"`, TOK.String},
    {"q{toks...}",  TOK.String},    {"q{({#line 0\n})}", TOK.String},
    {"q\"HDOC\nq\"***\"\nHDOC\"", TOK.String},
    {"q\"ȨÖF\nq{***}\nȨÖF\"",   TOK.String},
    {`q{q"<>"q"()"q"[]"q"{}"q"//"q"\\"q{}}`,  TOK.String},
  ];
  }
  else // D1
  {
  static Pair[] pairs2 = [
    {"\\n",  TOK.String},           {"\\u2028", TOK.String}
  ];
  }
  // Concatenate into a local array. Appending to the static array `pairs`
  // would make the table grow with every further call of this function.
  auto allPairs = pairs ~ pairs2;

  char[] src; // The source text to be scanned.

  // Join all token texts into a single string.
  foreach (i, pair; allPairs)
    if ((pair.kind == TOK.Comment &&
         pair.tokenText[1] == '/') || // Line comment.
        pair.kind == TOK.Shebang)
    { // These tokens extend to the end of the line; no separator is added.
      assert(allPairs[i+1].kind == TOK.Newline); // Must be followed by a newline.
      src ~= pair.tokenText;
    }
    else
      src ~= pair.tokenText ~ " ";

  // Lex the constructed source text.
  auto tables = new LexerTables();
  auto lx = new Lexer(new SourceText("lexer_unittest", src), tables);
  lx.scanAll();

  // Print the errors reported by the lexer.
  foreach (e; lx.errors)
    Stdout.formatln("{}({},{})L: {}", e.filePath, e.loc, e.col, e.getMsg);

  auto token = lx.firstToken, last = lx.lastToken;

  // Compare the text of each scanned token with the expected text.
  for (size_t i; i < allPairs.length && token < last; ++i, ++token)
    if (token.text != allPairs[i].tokenText)
      assert(0, Format("Scanned ‘{0}’ but expected ‘{1}’",
                       escapeNonPrintable(token.text), allPairs[i].tokenText));
}
/// Tests the Lexer's peek() method.
///
/// Starting at the head token, peek() is called repeatedly and the kind
/// of each token it yields is checked; first for a short source text,
/// then for an empty one.
void testLexerPeek()
{
  scope msg = new UnittestMsg("Testing method Lexer.peek()");
  auto tables = new LexerTables();

  // A source text with a few tokens.
  auto lx = new Lexer(new SourceText("", "unittest { }"), tables);
  lx.scanAll();
  auto next = lx.head;
  foreach (expected; [TOK.Newline, TOK.Unittest, TOK.LBrace,
                      TOK.RBrace, TOK.EOF])
  {
    lx.peek(next);
    assert(next.kind == expected);
  }

  // An empty source text.
  lx = new Lexer(new SourceText("", ""), tables);
  lx.scanAll();
  next = lx.head;
  foreach (expected; [TOK.Newline, TOK.EOF])
  {
    lx.peek(next);
    assert(next.kind == expected);
  }
}
/// Placeholder for a test of number literals. The body contains no
/// statements; it only lists number literals in comments
/// (presumably candidates for a future test -- TODO confirm.)
void testLexerNumbers()
{
  // Numbers unittest
  // 0L 0ULi 0_L 0_UL 0x0U 0x0p2 0_Fi 0_e2 0_F 0_i
  // 0u 0U 0uL 0UL 0L 0LU 0Lu
  // 0Li 0f 0F 0fi 0Fi 0i
  // 0b_1_LU 0b1000u
  // 0x232Lu
}
2655   // 0Li 0f 0F 0fi 0Fi 0i
2656   // 0b_1_LU 0b1000u
2657   // 0x232Lu
2658 }