/// Author: Aziz Köksal
/// License: GPL3
/// $(Maturity very high)
module dil.lexer.Lexer;

import dil.lexer.Token,
       dil.lexer.Funcs,
       dil.lexer.Identifier,
       dil.lexer.IDsEnum,
       dil.lexer.TokenSerializer,
       dil.lexer.Tables;
import dil.i18n.Messages;
import dil.Diagnostics,
       dil.HtmlEntities,
       dil.ChunkAllocator,
       dil.Array,
       dil.Version,
       dil.Unicode,
       dil.SourceText,
       dil.Time,
       dil.String;
import dil.Float : Float;
import util.uni : isUniAlpha;
import common;

/// The Lexer analyzes the characters of a source text and
/// produces an array of tokens.
class Lexer
{
  cchar* p; /// Points to the current character in the source text.
  cchar* end; /// Points one character past the end of the source text.
  SourceText srcText; /// The source text.

  TokenArray tokens; /// Array of Tokens.
  LexerTables tables; /// Used to look up token values.
  CharArray buffer; /// A buffer for string values.
  ChunkAllocator allocator; /// Allocates memory for non-token structs.

  /// Groups line information.
  static struct LineLoc
  {
    cchar* p; /// Points to the first character of the current line.
    uint n; /// Actual source text line number.
  }
  LineLoc lineLoc; /// Current line.

  uint inTokenString; /// > 0 if inside q{ }
  /// Holds the original file path and the modified one (by #line.)
  Token.HashLineInfo* hlinfo; /// Info set by "#line".

  // Members used for error messages:
  Diagnostics diag; /// For diagnostics.
  LexerError[] errors; /// List of errors.
  // End of variable members.

  alias T = S2T; /// Converts, e.g., T!"+" to TOK.Plus.

  static
  {
    const ushort chars_r = castInt(`r"`); /// `r"` as a ushort.
    const ushort chars_x = castInt(`x"`); /// `x"` as a ushort.
    const ushort chars_q = castInt(`q"`); /// `q"` as a ushort.
    const ushort chars_q2 = castInt(`q{`); /// `q{` as a ushort.
    const ushort chars_shebang = castInt("#!"); /// `#!` as a ushort.
    const uint chars_line = castInt("line"); /// `line` as a uint.
  }

  /// Constructs a Lexer object.
  /// Params:
  ///   srcText = The UTF-8 source code.
  ///   tables = Used to look up identifiers and token values.
  ///   diag = Used for collecting error messages.
  this(SourceText srcText, LexerTables tables, Diagnostics diag = null)
  {
    version(gc_tokens)
    {}
    else
      this.allocator.initialize(PAGESIZE);
    this.srcText = srcText;
    this.tables = tables;
    this.diag = diag ? diag : new Diagnostics();
    // The source text must end with the 4-byte sentinel string so the
    // scanner can look ahead without bounds checks.
    assert(text.length >= 4 && text[$-4..$] == SourceText.sentinelString,
      "source text has no sentinel character");
    this.p = text.ptr;
    this.end = this.p + text.length; // Point past the sentinel string.
    this.lineLoc.p = this.p;
    this.lineLoc.n = 1;
  }

  ~this()
  {
    allocator.destroy();
  }

  /// Returns the next free token from the array.
  /// NB: The bytes are not zeroed out.
  Token* newToken()
  {
    if (tokens.rem == 0)
      tokens.growX1_5();
    return tokens.cur++;
  }

  /// Allocates memory for T.
  T* new_(T)()
  {
    version (gc_tokens) // Use to test GC instead of custom allocator.
      return new T;
    else
    {
      auto t = cast(T*)allocator.allocate(T.sizeof);
      *t = T.init;
      return t;
    }
  }

  /// Callback function to TokenSerializer.deserialize().
  /// Re-scans the value of tokens whose text alone doesn't determine them.
  bool dlxCallback(Token* t)
  {
    switch (t.kind)
    { // Some tokens need special handling:
    case T!"Newline":
      setLineBegin(t.end);
      t.nlval = lookupNewline();
      break;
    case T!"Character": // May have escape sequences.
      this.p = t.start;
      scanCharacter(t);
      break;
    case T!"String": // Escape sequences; token strings; etc.
      this.p = t.start;
      // First dispatch on the two-character prefix (r", x", q", q{),
      // then on the single first character (`, ", \).
      // NOTE(review): both switches run unconditionally; after the first
      // one scans a string, *p points past that token — confirm the second
      // switch cannot accidentally match and rescan.
      dchar c = *cast(ushort*)p;
      switch (c)
      {
      case chars_r:
        ++this.p, scanRawString(t); break;
      case chars_x:
        scanHexString(t); break;
      version(D2)
      {
      case chars_q:
        scanDelimitedString(t); break;
      case chars_q2:
        scanTokenString(t); break;
      }
      default:
      }
      switch (*p)
      {
      case '`':
        scanRawString(t); break;
      case '"':
        scanNormalString(t); break;
      version(D1)
      { // Only in D1.
      case '\\':
        scanEscapeString(t); break;
      }
      default:
      }
      break;
    case T!"Comment": // Just rescan for newlines.
      if (t.isMultiline) // Multiline tokens may have newlines.
        for (auto p = t.start, end = t.end; p < end;)
          if (scanNewline(p))
            setLineBegin(p);
          else
            ++p;
      break;
    case T!"Int32", T!"Int64", T!"UInt32", T!"UInt64":
      this.p = t.start;
      scanNumber(t); // Complicated. Let the method handle this.
      break;
    case T!"Float32", T!"Float64", T!"Float80",
         T!"IFloat32", T!"IFloat64", T!"IFloat80":
      // The token is complete. What remains is to get its value.
      t.mpfloat = lookupFloat(copySansUnderscores(t.start, t.end));
      break;
    case T!"#line":
      this.p = t.start;
      scanSpecialTokenSequence(t); // Complicated. Let the method handle this.
      break;
    case T!"#!Shebang", T!"Empty": // Whitespace tokens.
      break;
    default:
    }
    return true;
  }

  /// Loads the tokens from a dlx file.
  /// Params:
  ///   data = The raw contents of the dlx (serialized token) file.
  /// Returns: true if deserialization succeeded; on failure the scanning
  ///   state is reset so the text can be lexed from scratch.
  bool fromDLXFile(ubyte[] data)
  {
    auto dlxTokens = TokenSerializer.deserialize(
      data, this.text(), tables.idents, &dlxCallback);
    if (dlxTokens.length)
    {
      alias ts = dlxTokens;
      // Rebuild the fixed framing tokens:
      // [NullToken, HEAD, Newline, ..., NullToken]
      ts[0] = Token.init; // NullToken
      ts[1].kind = T!"HEAD";
      ts[1].ws = null;
      ts[1].start = ts[1].end = this.text.ptr;
      ts[1].pvoid = null;
      ts[2].kind = T!"Newline";
      ts[2].ws = null;
      ts[2].start = ts[2].end = this.text.ptr;
      ts[2].nlval = lookupNewline();
      ts[$-1] = Token.init; // NullToken
      this.p = ts[$-2].end; // Continue scanning after the last real token.
      tokens.ptr = ts.ptr;
      tokens.cur = tokens.end = ts.ptr + ts.length;
    }
    else
    { /// Function failed. Reset...
      this.p = this.text.ptr;
      this.lineLoc.p = this.p;
      this.lineLoc.n = 1;
    }
    return !!dlxTokens.length;
  }

  /// Acquires the current buffer.
  CharArray getBuffer()
  {
    auto buffer = this.buffer;
    this.buffer = buffer.init;
    return buffer;
  }

  /// Takes over buffer if its capacity is greater than the current one.
  void setBuffer(CharArray buffer)
  {
    buffer.len = 0;
    if (buffer.cap > this.buffer.cap)
      this.buffer = buffer;
  }

  /// Returns the source text string.
  cstring text()
  {
    return srcText.data;
  }

  /// Returns the end pointer excluding the sentinel string.
  cchar* endX()
  {
    return this.end - SourceText.sentinelString.length;
  }

  /// Returns the first token of the source text.
  /// This can be the EOF token.
  /// Structure: [NullToken, HEAD, Newline, FirstToken, ..., NullToken]
  Token* firstToken()
  {
    return tokens.ptr + 3;
  }

  /// Returns the list of tokens excluding special beginning and end tokens.
  Token[] tokenList()
  {
    return firstToken[0 .. tokens.len-4];
  }

  /// Returns the HEAD token.
  Token* head()
  {
    return tokens.ptr + 1;
  }

  /// Returns the EOF token.
270 Token* lastToken() 271 { 272 return tokens.cur - 2; 273 } 274 275 /// Sets the value of the special token. 276 void finalizeSpecialToken(Token* t) 277 { 278 assert(t.kind == T!"SpecialID" && t.text[0..2] == "__"); 279 cstring str; 280 switch (t.ident.idKind) 281 { 282 case IDK.FILE: 283 str = errorFilePath(); 284 break; 285 case IDK.LINE: 286 t.sizet_ = this.errorLineNumber(this.lineNum); 287 break; 288 case IDK.DATE, IDK.TIME, IDK.TIMESTAMP: 289 str = Time.now(); 290 switch (t.kind) 291 { 292 case IDK.DATE: 293 str = Time.month_day(str) ~ ' ' ~ Time.year(str); break; 294 case IDK.TIME: 295 str = Time.time(str); break; 296 case IDK.TIMESTAMP: 297 break; // str is the timestamp. 298 default: assert(0); 299 } 300 break; 301 case IDK.VENDOR: 302 str = VENDOR; 303 break; 304 case IDK.VERSION: 305 t.uint_ = VERSION_MAJOR*1000 + VERSION_MINOR; 306 break; 307 case IDK.EOF: 308 assert(t.text == "__EOF__"); 309 t.kind = T!"EOF"; // Convert to EOF token, so that the Parser will stop. 310 break; 311 default: 312 assert(0); 313 } 314 if (str.ptr) 315 t.strval = lookupString(str, '\0'); 316 } 317 318 /// Returns the current line number. 319 size_t lineNum() 320 { 321 return lineLoc.n; 322 } 323 324 /// Sets the line pointer and increments the line number. 325 private void setLineBegin(cchar* p) 326 { 327 assert(isNewlineEnd(p - 1)); 328 lineLoc.p = p; 329 lineLoc.n++; 330 } 331 332 /// Returns true if p points to the last character of a Newline. 333 bool isNewlineEnd(cchar* p) 334 { 335 assert(p >= text.ptr && p < end); 336 return (*p).In('\n', '\r') || (p-=2) >= text.ptr && p[0..3].In(LS, PS); 337 } 338 339 /// Returns true if p points inside the source text. 340 bool isInText(cchar* p) 341 { 342 return text.ptr <= p && p < end; 343 } 344 345 alias StringValue = Token.StringValue; 346 alias IntegerValue = Token.IntegerValue; 347 alias NewlineValue = Token.NewlineValue; 348 349 /// Looks up a StringValue. Copies str if it's not a slice from the src text. 
350 StringValue* lookupString(cstring str, char postfix) 351 { 352 return tables.lookupString(str, postfix, !isInText(str.ptr)); 353 } 354 355 /// Forwards to tables.lookupString(). 356 cbinstr lookupString(cbinstr bstr) 357 { 358 auto str = cast(cstring)bstr; 359 return tables.lookupString(hashOf(str), str); 360 } 361 362 /// Looks up a Float in the table. 363 /// Params: 364 /// str = The zero-terminated string of the float number. 365 Float lookupFloat(cstring str) 366 { 367 assert(str.length && str[$-1] == 0); 368 auto hash = hashOf(str); 369 auto pFloat = hash in tables.floats; 370 if (!pFloat) 371 { 372 int precision; 373 auto f = new Float(precision, str); 374 // if (precision == 0) // Exact precision. 375 // {} 376 // else if (precision < 0) // Lower precision. 377 // {} 378 // else /*if (precision > 0)*/ // Higher precision. 379 // {} 380 tables.floats[hash] = f; 381 return f; 382 } 383 return *pFloat; 384 } 385 386 /// Looks up a newline value. 387 NewlineValue* lookupNewline() 388 { 389 auto lineNum = this.lineNum; 390 if (hlinfo) 391 { // Don't insert into the table, when '#line' tokens are in the text. 392 // This could be optimised with another table. 393 auto nl = new_!(NewlineValue); 394 nl.lineNum = lineNum; 395 auto hlinfo = nl.hlinfo = new_!(Token.HashLineInfo); 396 *hlinfo = *this.hlinfo; 397 return nl; 398 } 399 return tables.lookupNewline(lineNum); 400 } 401 402 /// Advance t one token forward. 403 void peek(ref Token* t) 404 { 405 t++; 406 assert(tokens.ptr <= t && t < tokens.cur); 407 } 408 409 /// Scans the whole source text until EOF is encountered. 410 void scanAll() 411 { // The divisor 6 is an average measured by lexing large D projects. 412 auto estimatedNrOfTokens = text.length / 6; 413 tokens.cap = estimatedNrOfTokens; 414 if (tokens.cap < 5) 415 tokens.cap = 5; // Guarantee space for at least 5 tokens. 
416 auto first = newToken(); 417 *first = Token.init; 418 auto head = newToken(); 419 head.kind = T!"HEAD"; 420 head.ws = null; 421 head.start = head.end = this.p; 422 head.pvoid = null; 423 // Add a "virtual" newline as the first token after the head. 424 auto newline = newToken(); 425 newline.kind = T!"Newline"; 426 newline.ws = null; 427 newline.start = newline.end = this.p; 428 newline.nlval = lookupNewline(); 429 // Scan optional shebang. 430 if (*cast(ushort*)this.p == chars_shebang) 431 scanShebang(); 432 // Main loop scanning the whole text. 433 Token* t; 434 do 435 scan(t = newToken()); 436 while (t.kind != T!"EOF"); 437 // Add a terminating token, similar to 0 in C-like strings. 438 auto last = newToken(); 439 *last = Token.init; 440 441 auto toks = tokenList; 442 foreach (x; toks) 443 {} 444 } 445 446 /// The "shebang" may optionally appear once at the beginning of a file. 447 /// $(BNF Shebang := "#!" AnyChar* EndOfLine) 448 void scanShebang() 449 { 450 auto p = this.p; 451 assert(p[0..2] == "#!"); 452 auto t = newToken(); 453 t.kind = T!"#!Shebang"; 454 t.start = p++; 455 while (!isEndOfLine(++p)) 456 isascii(*p) || decodeUTF8(p); 457 t.end = this.p = p; 458 t.pvoid = null; 459 } 460 461 /// The main method which recognizes the characters that make up a token. 462 /// 463 /// Complicated tokens are scanned in separate methods. 464 public void scan(Token* t) 465 in 466 { 467 assert(text.ptr <= p && p < end); 468 } 469 out 470 { 471 assert(text.ptr <= t.start && t.start < end, t.kind.toString); 472 assert(text.ptr <= t.end && t.end <= end, t.kind.toString); 473 assert(t.kind != T!"Invalid", t.text); 474 } 475 body 476 { 477 TOK kind; // The token kind that will be assigned to t.kind. 478 auto p = this.p; // Incrementing a stack variable is faster. 479 // Scan whitespace. 480 if (isspace(*p)) 481 { 482 t.ws = p; 483 while (isspace(*++p)) 484 {} 485 } 486 else 487 t.ws = null; 488 t.pvoid = null; 489 490 // Scan the text of the token. 
491 dchar c = *p; 492 { 493 t.start = this.p = p; 494 495 // Identifier or string literal. 496 if (isidbeg(c)) 497 { 498 c = *cast(ushort*)p; 499 if (c == chars_r) 500 return ++this.p, scanRawString(t); 501 if (c == chars_x) 502 return scanHexString(t); 503 version(D2) 504 { 505 if (c == chars_q) 506 return scanDelimitedString(t); 507 if (c == chars_q2) 508 return scanTokenString(t); 509 } 510 511 // Scan identifier. 512 Lidentifier: 513 do 514 { c = *++p; } 515 while (isident(c) || !isascii(c) && scanUnicodeAlpha(p)); 516 t.end = this.p = p; 517 518 auto id = tables.lookupIdentifier(t.text); 519 t.kind = id.kind; 520 t.ident = id; 521 assert(t.isKeyword || id.kind.In(T!"SpecialID", T!"Identifier")); 522 523 if (kind == T!"SpecialID") 524 finalizeSpecialToken(t); 525 return; 526 } 527 528 /// Advances p if p[1] equals x. 529 bool next(cchar x) 530 { 531 return p[1] == x ? (++p, 1) : 0; 532 } 533 534 // Newline. 535 if (*p == '\n' || *p == '\r' && (next('\n'), true)) 536 goto Lnewline; 537 538 assert(this.p == p); 539 if (isdigit(c)) 540 return scanNumber(t); 541 542 switch (c) 543 { 544 // Cases are sorted roughly according to times of occurrence. 545 mixin(cases(",", "(", ")", ";", "{", "}", "[", "]", ":")); 546 case '.': /* . .[0-9] .. ... */ 547 if (next('.')) 548 kind = next('.') ? T!"..." : T!".."; 549 else if (isdigit(p[1])) 550 return (this.p = p), scanFloat(t); 551 else 552 kind = T!"."; 553 goto Lcommon; 554 case '=': /* = == => */ 555 kind = next('=') ? T!"==" : (next('>') ? T!"=>" : T!"="); 556 goto Lcommon; 557 case '`': 558 return scanRawString(t); 559 case '"': 560 return scanNormalString(t); 561 version(D1) 562 { // Only in D1. 
563 case '\\': 564 return scanEscapeString(t); 565 } 566 case '\'': 567 return scanCharacter(t); 568 case '/': 569 switch (*++p) 570 { 571 case '=': 572 kind = T!"/="; 573 goto Lcommon; 574 case '+': 575 return (this.p = p), scanNestedComment(t); 576 case '*': 577 return (this.p = p), scanBlockComment(t); 578 case '/': // LineComment. 579 while (!isEndOfLine(++p)) 580 isascii(*p) || decodeUTF8(p); 581 kind = T!"Comment"; 582 goto Lreturn; 583 default: 584 kind = T!"/"; 585 goto Lreturn; 586 } 587 assert(0); 588 case '>': /* > >= >> >>= >>> >>>= */ 589 switch (*++p) 590 { 591 case '=': 592 kind = T!">="; 593 goto Lcommon; 594 case '>': 595 if (next('>')) 596 kind = next('=') ? T!">>>=" : T!">>>"; 597 else 598 kind = next('=') ? T!">>=" : T!">>"; 599 goto Lcommon; 600 default: 601 kind = T!">"; 602 goto Lreturn; 603 } 604 assert(0); 605 case '<': /* < <= <> <>= << <<= */ 606 switch (*++p) 607 { 608 case '=': 609 kind = T!"<="; 610 goto Lcommon; 611 case '<': 612 kind = next('=') ? T!"<<=" : T!"<<"; 613 goto Lcommon; 614 case '>': 615 kind = next('=') ? T!"<>=" : T!"<>"; 616 goto Lcommon; 617 default: 618 kind = T!"<"; 619 goto Lreturn; 620 } 621 assert(0); 622 case '!': /* ! !< !> !<= !>= !<> !<>= */ 623 switch (*++p) 624 { 625 case '<': 626 if (next('>')) 627 kind = next('=') ? T!"!<>=" : T!"!<>"; 628 else 629 kind = next('=') ? T!"!<=" : T!"!<"; 630 goto Lcommon; 631 case '>': 632 kind = next('=') ? T!"!>=" : T!"!>"; 633 goto Lcommon; 634 case '=': 635 kind = T!"!="; 636 goto Lcommon; 637 default: 638 kind = T!"!"; 639 goto Lreturn; 640 } 641 assert(0); 642 case '|': /* | || |= */ 643 kind = next('=') ? T!"|=" : (next('|') ? T!"||" : T!"|"); 644 goto Lcommon; 645 case '&': /* & && &= */ 646 kind = next('=') ? T!"&=" : (next('&') ? T!"&&" : T!"&"); 647 goto Lcommon; 648 case '+': /* + ++ += */ 649 kind = next('=') ? T!"+=" : (next('+') ? T!"++" : T!"+"); 650 goto Lcommon; 651 case '-': /* - -- -= */ 652 kind = next('=') ? T!"-=" : (next('-') ? 
T!"--" : T!"-"); 653 goto Lcommon; 654 case '~': /* ~ ~= */ 655 kind = next('=') ? T!"~=" : T!"~"; 656 goto Lcommon; 657 case '*': /* * *= */ 658 kind = next('=') ? T!"*=" : T!"*"; 659 goto Lcommon; 660 version(D2) 661 { 662 case '^': /* ^ ^= ^^ ^^= */ 663 if (next('=')) 664 kind = T!"^="; 665 else if (next('^')) 666 kind = next('=') ? T!"^^=" : T!"^^"; 667 else 668 kind = T!"^"; 669 goto Lcommon; 670 } // end of version(D2) 671 else 672 { 673 case '^': /* ^ ^= */ 674 kind = next('=') ? T!"^=" : T!"^"; 675 goto Lcommon; 676 } 677 case '%': /* % %= */ 678 kind = next('=') ? T!"%=" : T!"%"; 679 goto Lcommon; 680 // Single character tokens: 681 mixin(cases("@","$","?")); 682 case '#': 683 assert(this.p == p); 684 return scanSpecialTokenSequence(t); 685 default: 686 } 687 688 // Check for EOF 689 if (isEOF(c)) 690 { 691 assert(isEOF(*p), ""~*p); 692 kind = T!"EOF"; 693 assert(t.start == p); 694 goto Lreturn; 695 } 696 697 assert(this.p == p); 698 if (!isascii(c) && isUniAlpha(c = decodeUTF8(p))) 699 goto Lidentifier; 700 701 if (isUnicodeNewlineChar(c)) 702 goto Lnewline; 703 704 error(t.start, MID.IllegalCharacter, cast(dchar)c); 705 706 kind = T!"Illegal"; 707 t.dchar_ = c; 708 goto Lcommon; 709 } 710 711 Lcommon: 712 ++p; 713 Lreturn: 714 t.kind = kind; 715 t.end = this.p = p; 716 return; 717 718 Lnewline: 719 setLineBegin(++p); 720 t.kind = T!"Newline"; 721 t.nlval = lookupNewline(); 722 t.end = this.p = p; 723 return; 724 } 725 726 /// Generates case statements for token strings. 727 /// --- 728 //// // case_("<") -> 729 /// case 60u: 730 /// kind = T!"<"; 731 /// goto Lcommon; 732 /// --- 733 static char[] cases(string[] strs...) 734 { 735 char[] result; 736 foreach (str; strs) 737 { 738 char[] label_str = "Lcommon".dup; 739 if (str.length != 1) // Append length as a suffix. 
740 label_str ~= '0' + cast(char)str.length; 741 result ~= `case castInt("`~str~`"): kind = T!"`~str~`"; `~ 742 "goto "~label_str~";\n"; 743 } 744 return result; 745 } 746 //pragma(msg, cases("<", ">")); 747 748 /// An alternative scan method. 749 /// Profiling shows it's a bit slower. 750 public void scan_(Token* t) 751 in 752 { 753 assert(text.ptr <= p && p < end); 754 } 755 out 756 { 757 assert(text.ptr <= t.start && t.start < end, t.kind.toString); 758 assert(text.ptr <= t.end && t.end <= end, t.kind.toString); 759 assert(t.kind != T!"Invalid", t.text); 760 } 761 body 762 { 763 TOK kind; // The token kind that will be assigned to t.kind. 764 auto p = this.p; // Incrementing a stack variable is faster. 765 // Scan whitespace. 766 if (isspace(*p)) 767 { 768 t.ws = p; 769 while (isspace(*++p)) 770 {} 771 } 772 else 773 t.ws = null; 774 t.pvoid = null; 775 776 // Scan a token. 777 t.start = this.p = p; 778 779 uint c = *p; 780 781 assert(p == t.start); 782 // Check for ids first, as they occur the most often in source codes. 783 if (isidbeg(c)) 784 { 785 c = *cast(ushort*)p; 786 if (c == chars_r) 787 return (this.p = ++p), scanRawString(t); 788 if (c == chars_x) 789 return scanHexString(t); 790 version(D2) 791 { 792 if (c == chars_q) 793 return scanDelimitedString(t); 794 if (c == chars_q2) 795 return scanTokenString(t); 796 } 797 798 // Scan an identifier. 799 Lidentifier: 800 do 801 { c = *++p; } 802 while (isident(c) || !isascii(c) && scanUnicodeAlpha(p)); 803 t.end = this.p = p; 804 805 auto id = tables.lookupIdentifier(t.text); 806 t.kind = id.kind; 807 t.ident = id; 808 assert(t.isKeyword || id.kind.In(T!"SpecialID", T!"Identifier")); 809 810 if (kind == T!"SpecialID") 811 finalizeSpecialToken(t); 812 return; 813 } 814 815 if (isdigit(c)) 816 return scanNumber(t); 817 818 819 // Thanks to the 4 zeros terminating the text, 820 // it is possible to look ahead 4 characters. 821 c = *cast(uint*)p; 822 823 // 4 character tokens. 
824 switch (c) 825 { 826 mixin(cases(">>>=", "!<>=")); 827 default: 828 } 829 830 version(BigEndian) 831 c >>>= 8; 832 else 833 c &= 0x00FFFFFF; 834 assert(p == t.start); 835 // 3 character tokens. 836 switch (c) 837 { 838 mixin(cases("<<=", ">>=", ">>>", "...", 839 "!<=", "!>=", "!<>", "<>=", "^^=")); 840 case castInt(LS), castInt(PS): 841 p += 2; 842 goto Lnewline; 843 default: 844 } 845 846 version(BigEndian) 847 c >>>= 8; 848 else 849 c &= 0x0000FFFF; 850 assert(p == t.start); 851 // 2 character tokens. 852 switch (c) 853 { 854 case castInt("/+"): 855 this.p = ++p; // Skip / 856 return scanNestedComment(t); 857 case castInt("/*"): 858 this.p = ++p; // Skip / 859 return scanBlockComment(t); 860 case castInt("//"): // LineComment. 861 ++p; // Skip / 862 assert(*p == '/'); 863 while (!isEndOfLine(++p)) 864 isascii(*p) || decodeUTF8(p); 865 kind = T!"Comment"; 866 goto Lreturn; 867 mixin(cases("<=", ">=", "<<", ">>", "==", "=>", "!=", "!<", "!>", "<>", 868 "..", "&&", "&=", "||", "|=", "++", "+=", "--", "-=", "*=", "/=", "%=", 869 "^=", "~=", "^^")); 870 case castInt("\r\n"): 871 ++p; 872 goto Lnewline; 873 default: 874 } 875 876 static TOK[127] char2TOK = [ 877 '<': T!"<", '>': T!">", '^': T!"^", '!': T!"!", 878 '&': T!"&", '|': T!"|", '+': T!"+", '-': T!"-", 879 '=': T!"=", '~': T!"~", '*': T!"*", '/': T!"/", 880 '%': T!"%", '(': T!"(", ')': T!")", '[': T!"[", 881 ']': T!"]", '{': T!"{", '}': T!"}", ':': T!":", 882 ';': T!";", '?': T!"?", ',': T!",", '$': T!"$", 883 '@': T!"@" 884 ]; 885 886 version(BigEndian) 887 c >>>= 8; 888 else 889 c &= 0x000000FF; 890 assert(p == t.start); 891 assert(*p == c, Format("p={0},c={1}", *p, cast(dchar)c)); 892 // 1 character tokens. 893 // TODO: consider storing the token type in ptable. 
894 if (c < 127 && (kind = char2TOK[c]) != 0) 895 goto Lcommon; 896 897 assert(this.p == p); 898 switch (c) 899 { 900 case '\r', '\n': 901 goto Lnewline; 902 case '\'': 903 return scanCharacter(t); 904 case '`': 905 return scanRawString(t); 906 case '"': 907 return scanNormalString(t); 908 version(D2) 909 {} 910 else { // Only in D1. 911 case '\\': 912 return scanEscapeString(t); 913 } 914 case '.': 915 if (isdigit(p[1])) 916 return (this.p = p), scanFloat(t); 917 kind = T!"."; 918 ++p; 919 goto Lreturn; 920 case '#': 921 assert(this.p == p); 922 return scanSpecialTokenSequence(t); 923 default: 924 } 925 926 assert(p == t.start); 927 assert(*p == c); 928 929 // Check for EOF 930 if (isEOF(c)) 931 { 932 assert(isEOF(*p), *p~""); 933 kind = T!"EOF"; 934 assert(t.start == p); 935 goto Lreturn; 936 } 937 938 if (!isascii(c) && isUniAlpha(c = decodeUTF8(p))) 939 goto Lidentifier; 940 941 error(t.start, MID.IllegalCharacter, cast(dchar)c); 942 943 kind = T!"Illegal"; 944 t.dchar_ = c; 945 goto Lcommon; 946 947 Lcommon4: 948 ++p; 949 Lcommon3: 950 ++p; 951 Lcommon2: 952 ++p; 953 Lcommon: 954 ++p; 955 Lreturn: 956 t.kind = kind; 957 t.end = this.p = p; 958 return; 959 960 Lnewline: 961 setLineBegin(++p); 962 t.kind = T!"Newline"; 963 t.nlval = lookupNewline(); 964 t.end = this.p = p; 965 return; 966 } 967 968 /// Scans a block comment. 
  ///
  /// $(BNF BlockComment := "/*" AnyChar* "*/")
  void scanBlockComment(Token* t)
  {
    auto p = this.p;
    assert((p-1)[0..2] == "/*");
    auto tokenLine = this.lineLoc; // Remember where the comment started.
  Loop:
    while (1)
      switch (*++p)
      {
      case '*':
        if (p[1] != '/')
          continue;
        p += 2; // Skip */
        break Loop;
      case '\r':
        if (p[1] == '\n')
          ++p;
        goto case;
      case '\n':
        setLineBegin(p+1);
        break;
      default:
        if (!isascii(*p))
        {
          if (isUnicodeNewlineChar(decodeUTF8(p)))
            goto case '\n';
        }
        else if (isEOF(*p)) {
          error(tokenLine, t.start, MID.UnterminatedBlockComment);
          break Loop;
        }
      }
    t.kind = T!"Comment";
    t.end = this.p = p;
    return;
  }

  /// Scans a nested comment.
  ///
  /// $(BNF NestedComment := "/+" (NestedComment | AnyChar)* "+/")
  void scanNestedComment(Token* t)
  {
    auto p = this.p;
    assert((p-1)[0..2] == "/+");
    auto tokenLine = this.lineLoc; // Remember where the comment started.
    uint level = 1; // Current nesting depth of "/+" pairs.
  Loop:
    while (1)
      switch (*++p)
      {
      case '/':
        if (p[1] == '+')
          ++p, ++level;
        continue;
      case '+':
        if (p[1] != '/')
          continue;
        ++p;
        if (--level != 0)
          continue;
        ++p; // Skip the / of the final +/
        break Loop;
      case '\r':
        if (p[1] == '\n')
          ++p;
        goto case;
      case '\n':
        setLineBegin(p+1);
        break;
      default:
        if (!isascii(*p))
        {
          if (isUnicodeNewlineChar(decodeUTF8(p)))
            goto case '\n';
        }
        else if (isEOF(*p)) {
          error(tokenLine, t.start, MID.UnterminatedNestedComment);
          break Loop;
        }
      }
    t.kind = T!"Comment";
    t.end = this.p = p;
    return;
  }

  /// Scans the postfix character of a string literal.
  ///
  /// $(BNF PostfixChar := "c" | "w" | "d")
  static char scanPostfix(ref cchar* p)
  {
    assert(p[-1].In('"', '`', '}'));
    return (*p).In('c', 'w', 'd') ? *p++ : '\0';
  }

  /// Scans a normal string literal.
  ///
  /// $(BNF NormalStringLiteral := '"' (EscapeSequence | AnyChar)* '"')
  void scanNormalString(Token* t)
  {
    auto p = this.p;
    assert(*p == '"');
    auto tokenLine = this.lineLoc; // Remember where the string started.
    t.kind = T!"String";
    auto value = getBuffer();
    auto prev = ++p; // Skip '"'. prev is used to copy chunks to value.
    cchar* prev2;

    while (*p != '"')
      switch (*p)
      {
      case '\\':
        if (prev != p) value ~= slice(prev, p);
        bool isBinary;
        auto c = scanEscapeSequence(p, isBinary);
        if (isascii(c) || isBinary)
          value ~= cast(char)c;
        else
          encodeUTF8(value, c);
        prev = p;
        break;
      case '\r':
        prev2 = p;
        if (p[1] == '\n')
          ++p;
      LconvertNewline:
        value ~= slice(prev, prev2 + 1); // +1 is for '\n'.
        *(value.cur-1) = '\n'; // Convert Newline to '\n'.
        prev = p+1;
        goto case;
      case '\n':
        setLineBegin(++p);
        break;
      case 0, _Z_:
        error(tokenLine, t.start, MID.UnterminatedString);
        goto Lerror;
      default:
        if (!isascii(*p) && isUnicodeNewlineChar(decodeUTF8(p)))
        { // LS/PS are 3 bytes; back up to the byte before the sequence.
          prev2 = p - 2;
          goto LconvertNewline;
        }
        ++p;
      }
    assert(*p == '"');

    {
      auto finalString = slice(prev, p);
      if (value.len)
        finalString = ((value ~= finalString), value[]); // Append previous string.
      ++p; // Skip '"'.
      t.strval = lookupString(finalString, scanPostfix(p));
    }
  Lerror:
    t.end = this.p = p;
    setBuffer(value);
    return;
  }

  /// Scans an escape string literal.
1130 /// 1131 /// $(BNF EscapeStringLiteral := EscapeSequence+ ) 1132 void scanEscapeString(Token* t) 1133 { 1134 version(D1) 1135 { 1136 assert(*p == '\\'); 1137 auto value = getBuffer(); 1138 do 1139 { 1140 bool isBinary; 1141 auto c = scanEscapeSequence(p, isBinary); 1142 if (isascii(c) || isBinary) 1143 value ~= cast(char)c; 1144 else 1145 encodeUTF8(value, c); 1146 } while (*p == '\\'); 1147 t.strval = lookupString(value, '\0'); 1148 t.kind = T!"String"; 1149 t.end = p; 1150 setBuffer(value); 1151 } 1152 } 1153 1154 /// Scans a character literal. 1155 /// 1156 /// $(BNF CharacterLiteral := "'" (EscapeSequence | AnyChar) "'") 1157 void scanCharacter(Token* t) 1158 { 1159 assert(*p == '\''); 1160 t.kind = T!"Character"; 1161 switch (*++p) 1162 { 1163 case '\\': 1164 bool notused; 1165 t.dchar_ = scanEscapeSequence(p, notused); 1166 break; 1167 case '\'': 1168 error(t.start, MID.EmptyCharacterLiteral); 1169 break; 1170 default: 1171 if (isEndOfLine(p)) 1172 break; 1173 t.dchar_ = isascii(*p) ? *p : decodeUTF8(p); 1174 ++p; 1175 } 1176 1177 if (*p == '\'') 1178 ++p; 1179 else 1180 error(t.start, MID.UnterminatedCharacterLiteral); 1181 t.end = p; 1182 } 1183 1184 /// Scans a raw string literal. 1185 /// 1186 /// $(BNF RawStringLiteral := 'r"' AnyChar* '"' | "`" AnyChar* "`") 1187 void scanRawString(Token* t) 1188 { 1189 auto p = this.p; 1190 assert(*p == '`' || (p-1)[0..2] == `r"`); 1191 auto tokenLine = this.lineLoc; 1192 t.kind = T!"String"; 1193 uint delim = *p; 1194 auto value = getBuffer(); 1195 auto prev = ++p; 1196 cchar* prev2; 1197 1198 while (*p != delim) 1199 switch (*p) 1200 { 1201 case '\r': 1202 prev2 = p; 1203 if (p[1] == '\n') 1204 ++p; 1205 LconvertNewline: 1206 value ~= slice(prev, prev2 + 1); 1207 *(value.cur-1) = '\n'; // Convert Newline to '\n'. 1208 prev = p+1; 1209 goto case; 1210 case '\n': 1211 setLineBegin(++p); 1212 break; 1213 case 0, _Z_: 1214 error(tokenLine, t.start, (delim == '"' ? 
1215 MID.UnterminatedRawString : MID.UnterminatedBackQuoteString)); 1216 goto Lerror; 1217 default: 1218 if (!isascii(*p) && isUnicodeNewlineChar(decodeUTF8(p))) 1219 { 1220 prev2 = p - 2; 1221 goto LconvertNewline; 1222 } 1223 ++p; 1224 } 1225 assert((*p).In('"', '`')); 1226 1227 { 1228 auto finalString = slice(prev, p); 1229 if (value.len) 1230 finalString = ((value ~= finalString), value[]); // Append previous string. 1231 ++p; // Skip '"' or '`'. 1232 t.strval = lookupString(finalString, scanPostfix(p)); 1233 } 1234 Lerror: 1235 t.end = this.p = p; 1236 setBuffer(value); 1237 return; 1238 } 1239 1240 /// Scans a hexadecimal string literal. 1241 /// 1242 /// $(BNF HexStringLiteral := 'x"' (HexDigit HexDigit)* '"' 1243 ////HexDigit := [a-fA-F\d]) 1244 void scanHexString(Token* t) 1245 { 1246 auto p = this.p; 1247 assert(p[0..2] == `x"`); 1248 t.kind = T!"String"; 1249 1250 auto tokenLine = this.lineLoc; 1251 1252 auto value = getBuffer(); 1253 ubyte h; // Current hex number. 1254 bool odd; // True if one hex digit has been scanned previously. 1255 1256 ++p; 1257 assert(*p == '"'); 1258 while (*++p != '"') 1259 switch (*p) 1260 { 1261 case '\r': 1262 if (p[1] == '\n') 1263 ++p; 1264 goto case; 1265 case '\n': 1266 setLineBegin(p+1); 1267 continue; 1268 default: 1269 dchar c = *p; 1270 if (hex2val(c)) 1271 { 1272 if (odd) 1273 value ~= cast(ubyte)(h << 4 | c); 1274 else 1275 h = cast(ubyte)c; 1276 odd = !odd; 1277 } 1278 else if (isspace(c)) 1279 continue; // Skip spaces. 
1280 else if (isEOF(c)) { 1281 error(tokenLine, t.start, MID.UnterminatedHexString); 1282 goto Lerror; 1283 } 1284 else 1285 { 1286 auto errorAt = p; 1287 if (!isascii(c) && isUnicodeNewlineChar(c = decodeUTF8(p))) 1288 goto case '\n'; 1289 error(errorAt, MID.NonHexCharInHexString, cast(dchar)c); 1290 } 1291 } 1292 if (odd) 1293 error(tokenLine, t.start, MID.OddNumberOfDigitsInHexString); 1294 ++p; 1295 t.strval = lookupString(value[], scanPostfix(p)); 1296 Lerror: 1297 t.end = this.p = p; 1298 setBuffer(value); 1299 return; 1300 } 1301 1302 /// Scans a delimited string literal. 1303 /// 1304 /// $(BNF 1305 ////DelimitedStringLiteral := 'q"' OpeningDelim AnyChar* MatchingDelim '"' 1306 ////OpeningDelim := "[" | "(" | "{" | "<" | Identifier EndOfLine 1307 ////MatchingDelim := "]" | ")" | "}" | ">" | EndOfLine Identifier 1308 ////) 1309 void scanDelimitedString(Token* t) 1310 { 1311 version(D2) 1312 { 1313 auto p = this.p; 1314 assert(p[0..2] == `q"`); 1315 t.kind = T!"String"; 1316 1317 auto tokenLine = this.lineLoc; 1318 1319 auto value = getBuffer(); 1320 dchar nesting_delim, // '[', '(', '<', '{', or 0 if no nesting delimiter. 1321 closing_delim; // Will be ']', ')', '>', '}, 1322 // the first character of an identifier or 1323 // any other Unicode/ASCII character. 1324 cstring str_delim; // Identifier delimiter. 1325 uint level = 1; // Counter for nestable delimiters. 1326 1327 ++p; ++p; // Skip q" 1328 auto prev = p; 1329 cchar* prev2; 1330 dchar c = *p; 1331 // Scan the delimiter. 1332 switch (c) 1333 { 1334 case '(': 1335 nesting_delim = c; 1336 closing_delim = ')'; // c + 1 1337 break; 1338 case '[', '<', '{': 1339 nesting_delim = c; 1340 // Get to the closing counterpart. Feature of ASCII table. 1341 closing_delim = c + 2; // ']', '>' or '}' 1342 break; 1343 default: 1344 if (isNewline(p)) 1345 { 1346 error(p, MID.DelimiterIsMissing); 1347 goto Lerror; 1348 } 1349 1350 auto idbegin = p; 1351 closing_delim = isascii(c) ? 
        c : decodeUTF8(p);

      if (isidbeg(closing_delim) || isUniAlpha(closing_delim))
      { // Scan: Identifier Newline (heredoc-style delimiter.)
        do
        { c = *++p; }
        while (isident(c) || !isascii(c) && scanUnicodeAlpha(p));
        str_delim = slice(idbegin, p); // Scanned identifier delimiter.
        if (scanNewline(p))
          setLineBegin(p);
        else
          error(p, MID.NoNewlineAfterIdDelimiter, str_delim);
        --p; // Go back one because of "c = *++p;" in main loop.
      }
    }
    assert(closing_delim);

    if (isspace(closing_delim))
      error(p, MID.DelimiterIsWhitespace);

    bool checkStringDelim(cchar* p)
    { // Returns true if p points to the closing string delimiter.
      assert(str_delim.length != 0, ""~*p);
      return this.lineLoc.p is p && // Must be at the beginning of a new line.
        this.endX()-p >= str_delim.length && // Check remaining length.
        p[0..str_delim.length] == str_delim; // Compare.
    }

    // Scan the contents of the string.
    while (1)
      switch (c = *++p)
      {
      case '\r':
        prev2 = p;
        if (p[1] == '\n')
          ++p;
      LconvertNewline:
        // Flush the chunk up to (and including) the newline into 'value',
        // rewriting whatever newline form was found as a single '\n'.
        value ~= slice(prev, prev2 + 1); // +1 is for '\n'.
        *(value.cur-1) = '\n'; // Convert Newline to '\n'.
        prev = p+1;
        goto case;
      case '\n':
        setLineBegin(p+1);
        break;
      case 0, _Z_: // Sentinel / EOF.
        error(tokenLine, t.start, MID.UnterminatedDelimitedString);
        goto Lerror;
      default:
        prev2 = p;
        if (!isascii(c))
        { // Unicode branch.
          c = decodeUTF8(p);
          if (isUnicodeNewlineChar(c))
            goto LconvertNewline;
          if (c == closing_delim)
            if (str_delim.length)
            { // Matched first character of the string delimiter.
              if (checkStringDelim(prev2))
              {
                p = prev2 + str_delim.length;
                goto Lreturn2;
              }
            }
            else
            { // NB: Unicode closing delimiters cannot nest; level is still 1.
              assert(level == 1);
              --level;
              goto Lreturn;
            }
        }
        else // ASCII branch.
        if (c == nesting_delim)
          ++level;
        else if (c == closing_delim)
          if (str_delim.length)
          { // Matched first character of the string delimiter.
            if (checkStringDelim(p))
            {
              p += str_delim.length;
              goto Lreturn2;
            }
          }
          else if (--level == 0)
            goto Lreturn;
      }
  Lreturn: // Character delimiter.
    assert(c == closing_delim);
    assert(level == 0);
    ++p; // Skip closing delimiter.
  Lreturn2: // String delimiter.
    {
      auto finalString = slice(prev, prev2);
      if (value.len)
        finalString = ((value ~= finalString), value[]); // Append previous string.

      char postfix;
      if (*p == '"')
        postfix = scanPostfix(++p);
      else
      { // Pass str_delim or encode and pass closing_delim as a string.
        if (!str_delim.length)
        {
          char[] tmp;
          encode(tmp, closing_delim);
          str_delim = tmp;
        }
        error(p, MID.ExpectedDblQuoteAfterDelim, str_delim);
      }
      t.strval = lookupString(finalString, postfix);
    }
  Lerror: // Success paths fall through; both finalize the token.
    t.end = this.p = p;
    setBuffer(value);
    } // version(D2)
  }

  /// Scans a token string literal.
  ///
  /// $(BNF TokenStringLiteral := "q{" Token* "}")
  void scanTokenString(Token* t)
  {
    version(D2)
    {
    assert(p[0..2] == `q{`);
    t.kind = T!"String";

    auto tokenLine = this.lineLoc; // Remember the line for error reports.

    ++inTokenString; // A guard against changes to 'this.hlinfo'.

    ++p; ++p; // Skip q{
    cchar* str_begin = p, str_end; // Inner string.
    TokenArray innerTokens; // The tokens inside this string.
    innerTokens.cap = 1;
    // Set to true, if '\r', LS, PS, or multiline tokens are encountered.
    bool convertNewlines;

    Token* new_t;
    uint level = 1; // Current nesting level of curly braces.
1490 Loop: 1491 while (1) 1492 { 1493 if (innerTokens.rem == 0) 1494 innerTokens.growX1_5(); 1495 scan(new_t = innerTokens.cur++); 1496 switch (new_t.kind) 1497 { 1498 case T!"{": 1499 ++level; 1500 break; 1501 case T!"}": 1502 if (--level == 0) 1503 break Loop; 1504 break; 1505 case T!"String", T!"Comment": 1506 if (new_t.isMultiline()) 1507 convertNewlines = true; 1508 break; 1509 case T!"Newline": 1510 if (*new_t.start != '\n') 1511 convertNewlines = true; 1512 break; 1513 case T!"EOF": 1514 error(tokenLine, t.start, MID.UnterminatedTokenString); 1515 this.p = new_t.ws ? new_t.ws : new_t.start; // Reset. 1516 break Loop; 1517 default: 1518 } 1519 } 1520 assert(new_t.kind.In(T!"}", T!"EOF")); 1521 1522 char postfix; 1523 if (new_t.kind == T!"EOF") 1524 str_end = t.end = p; 1525 else 1526 { 1527 str_end = p-1; 1528 postfix = scanPostfix(p); 1529 t.end = p; 1530 } 1531 *new_t = Token.init; // Terminate with a "0-token". 1532 1533 auto value = slice(str_begin, str_end); 1534 // Convert newlines to '\n'. 1535 if (convertNewlines) 1536 { // Copy the value and convert the newlines. 1537 auto tmp = getBuffer(); 1538 tmp.len = value.length; 1539 auto q = str_begin; // Reader. 1540 auto s = tmp.ptr; // Writer. 1541 for (; q < str_end; ++q) 1542 switch (*q) 1543 { 1544 case '\r': 1545 if (q[1] == '\n') 1546 ++q; 1547 goto case; 1548 case '\n': 1549 assert(isNewlineEnd(q)); 1550 *s++ = '\n'; // Convert Newline to '\n'. 1551 break; 1552 default: 1553 if (isUnicodeNewline(q)) 1554 { 1555 ++q; ++q; 1556 goto case '\n'; 1557 } 1558 *s++ = *q; // Copy current character. 1559 } 1560 tmp.len = s - tmp.ptr; 1561 value = tmp[]; 1562 setBuffer(tmp); 1563 } 1564 1565 auto strval = new_!(StringValue); 1566 strval.str = lookupString(cast(cbinstr)value); 1567 strval.pf = postfix; 1568 strval.tokens = innerTokens.ptr; 1569 t.strval = strval; 1570 1571 --inTokenString; 1572 } // version(D2) 1573 } 1574 1575 /// Scans an escape sequence. 
  ///
  /// $(BNF
  ////EscapeSequence := "\\" (BinaryEsc | UnicodeEsc | CEsc | HTMLEsc)
  ////BinaryEsc := Octal{1,3} | "x" Hex{2}
  ////UnicodeEsc := "u" Hex{4} | "U" Hex{8}
  ////CEsc := "'" | '"' | "?" | "\\" | "a" | "b" | "f" | "n" | "r" | "t" | "v"
  ////HTMLEsc := "&" EntityName ";"
  ////EntityName := [a-zA-Z] [a-zA-Z\d]*
  ////)
  /// Params:
  ///   ref_p = Used to scan the sequence.
  ///   isBinary = Set to true for octal and hexadecimal escapes.
  /// Returns: The escape value.
  dchar scanEscapeSequence(ref cchar* ref_p, out bool isBinary)
  out(result)
  { assert(isValidChar(result)); }
  body
  {
    auto p = ref_p;
    assert(*p == '\\');
    // Used for error reporting.
    MID mid;
    cstring err_arg;

    ++p; // Skip '\\'.
    dchar c = char2ev(*p); // Table lookup.
    if (c)
    { // Simple C escape (\n, \t, ...): resolved by the table.
      ++p;
      goto Lreturn;
    }

    switch (*p)
    {
      uint loopCounter; // Number of hex digit *pairs* still to read.

    case 'x':
      isBinary = true;
      loopCounter = 1;
      // NB: deliberate fall-through into case_Unicode.
    case_Unicode:
      assert(c == 0 && loopCounter.In(1, 2, 4));
      mid = MID.InsufficientHexDigits;
      while (loopCounter--)
      { // Decode two hex digits.
        dchar x = *++p;
        if (!hex2val(x))
          goto Lerror; // Not a hexdigit.
        c = c << 4 | x;
        x = *++p;
        if (!hex2val(x))
          goto Lerror;
        c = c << 4 | x;
      }
      ++p;
      if (!isValidChar(c))
      { // E.g. a surrogate or a value above U+10FFFF.
        mid = MID.InvalidUnicodeEscapeSequence;
        goto Lerror;
      }
      break;
    case 'u':
      loopCounter = 2;
      goto case_Unicode;
    case 'U':
      loopCounter = 4;
      goto case_Unicode;
    default:
      size_t x = *p - '0';
      if (x < 8)
      { // Octal sequence. Up to three digits, value must fit in a byte.
        isBinary = true;
        assert(c == 0);
        c = x;
        if ((x = *++p - '0') >= 8)
          break;
        c = c * 8 + x;
        if ((x = *++p - '0') >= 8)
          break;
        c = c * 8 + x;
        ++p;
        if (c <= 0xFF)
          break;
        mid = MID.InvalidOctalEscapeSequence;
      }
      else if (*p == '&')
      { // HTML entity: "\&name;".
        if (isalpha(*++p))
        {
          auto begin = p;
          while (isalnum(*++p))
          {}

          if (*p == ';')
          { // Pass entity excluding '&' and ';'.
            c = entity2Unicode(slice(begin, p));
            ++p; // Skip ;
            if (c)
              goto Lreturn; // Return valid escape value.
            else
              mid = MID.UndefinedHTMLEntity;
          }
          else
            mid = MID.UnterminatedHTMLEntity;
        }
        else
          mid = MID.InvalidBeginHTMLEntity;
      }
      else if (isEndOfLine(p)) {
        mid = MID.UndefinedEscapeSequence;
        err_arg = isEOF(*p) ? `\EOF` : `\NewLine`;
      }
      else
      { // Unknown escape: report "\X" with the offending character.
        auto tmp = "\\".dup;
        // TODO: check for non-printable character?
        encode(tmp, isascii(*p) ? *p : decodeUTF8(p));
        err_arg = tmp;
        ++p;
        mid = MID.UndefinedEscapeSequence;
      }
      goto Lerror;
    }

  Lreturn:
    ref_p = p;
    return c;

  Lerror:
    if (!err_arg.length)
      err_arg = slice(ref_p, p);
    error(ref_p, mid, err_arg);
    ref_p = p; // Is at the beginning of the sequence. Update now.
    return REPLACEMENT_CHAR; // Error: return replacement character.
  }

  /// Scans a number literal.
  ///
  /// $(BNF
  ////IntegerLiteral := (Dec | Hex | Bin | Oct) Suffix?
  ////Dec := "0" | [1-9] [\d_]*
  ////Hex := "0" [xX] "_"* HexDigits
  ////Bin := "0" [bB] "_"* [01] [01_]*
  ////Oct := "0" [0-7_]*
  ////Suffix := "L" [uU]? | [uU] "L"?
  ////)
  /// Invalid: "0b_", "0x_", "._" etc.
  void scanNumber(Token* t)
  {
    assert(isdigit(*p)); // NB: refers to the member this.p (local p below.)
    auto p = this.p;
    ulong ulong_; // The integer value.
    bool overflow; // True if an overflow was detected.
    bool isDecimal; // True for Dec literals.
    bool hasDecimalDigits; // To check for 8s and 9s in octal numbers.
    size_t digits; // Used to detect overflow in hex/bin numbers.
    size_t x; // Current digit value.

    bool isfloat(char c)
    { // True if the decimal point '.' is not followed by:
      // another '.', an identifier character, or a non-ASCII byte.
      return c != '.' && !isidbeg(c) && isascii(c);
    }

    if (*p != '0')
      goto LscanInteger;
    ++p; // Skip zero.
    // Check for xX bB ...
    switch (*p)
    {
    case 'x','X':
      goto LscanHex;
    case 'b','B':
      goto LscanBinary;
    case 'L':
      if (p[1] == 'i')
        goto LscanFloat; // 0Li
      break; // 0L
    case '.':
      if (!isfloat(p[1]))
        break;
      goto LscanFloat; // 0.[0-9]
    case 'i','f','F', // Imaginary and float literal suffixes.
         'e', 'E': // Float exponent.
      goto LscanFloat;
    default:
      if (*p == '_')
        goto LscanOctal; // 0_
      else if ((x = *p - '0') < 10)
        if (x > 7)
          goto Loctal_hasDecimalDigits; // 08 or 09
        else
          goto Loctal_scannedFirstDigit; // 0[0-7]
    }

    // Number 0
    assert(p[-1] == '0' && !isdigi_(*p) && ulong_ == 0);
    isDecimal = true;
    goto Lfinalize;

  LscanInteger:
    assert(*p != '0' && isdigit(*p));
    isDecimal = true;
    for (; 1; ++p)
      if ((x = *p - '0') < 10)
      {
        // ulong.max ends in ...15; x < 6 guards the last digit.
        if (ulong_ < ulong.max/10 || (ulong_ == ulong.max/10 && x < 6))
          ulong_ = ulong_ * 10 + x;
        else
        { // Overflow: skip following digits.
          overflow = true;
          while (isdigit(*++p))
          {}
          break;
        }
      }
      else if (*p != '_')
        break;

    // The number could be a float, so check overflow below.
    switch (*p)
    {
    case '.':
      if (isfloat(p[1]))
        goto LscanFloat;
      break;
    case 'L':
      if (p[1] != 'i')
        break;
      goto LscanFloat;
    case 'i', 'f', 'F', 'e', 'E':
      goto LscanFloat;
    default:
    }

    if (overflow)
      error(t.start, MID.OverflowDecimalNumber);

    assert(isdigi_(p[-1]) && !isdigi_(*p));
    goto Lfinalize;

  LscanHex:
    assert(digits == 0);
    assert((*p).In('x', 'X'));
    while (1)
    {
      x = *++p;
      if (hex2val(x))
      {
        ulong_ = ulong_ << 4 | x;
        ++digits;
      }
      else if (*p != '_')
        break;
    }

    assert((ishexa_(p[-1]) || p[-1].In('x', 'X')) && !ishexa_(*p));

    switch (*p)
    {
    case '.':
      if (!isfloat(p[1]))
        break;
      goto case;
    case 'p', 'P': // Binary exponent: this is a hex float.
      this.p = p;
      return scanHexFloat(t);
    default:
    }

    // More than 16 hex digits cannot fit into 64 bits.
    if (digits == 0 || digits > 16)
      error(t.start,
        digits == 0 ? MID.NoDigitsInHexNumber : MID.OverflowHexNumber);

    goto Lfinalize;

  LscanBinary:
    assert(digits == 0);
    assert((*p).In('b', 'B'));
    while (1)
      if ((x = *++p - '0') < 2)
      {
        ++digits;
        ulong_ = ulong_ * 2 + x;
      }
      else if (*p != '_')
        break;

    if (digits == 0 || digits > 64)
      error(t.start,
        digits == 0 ? MID.NoDigitsInBinNumber : MID.OverflowBinaryNumber);

    assert(p[-1].In('0', '1', '_', 'b', 'B'), p[-1] ~ "");
    assert(!(*p).In('0', '1', '_'));
    goto Lfinalize;

  LscanOctal:
    assert(*p == '_');
    while (1)
      if ((x = *++p - '0') < 8)
      {
        // ulong.max is odd; x < 2 guards the last octal digit.
        if (ulong_ < ulong.max/2 || (ulong_ == ulong.max/2 && x < 2))
        Loctal_scannedFirstDigit: // Entry point for "0[0-7]" from above.
          ulong_ = ulong_ * 8 + x;
        else
        { // Overflow: skip following digits.
          overflow = true;
          while (isoctal(*++p))
          {}
          break;
        }
      }
      else if (*p != '_')
        break;

    if (isdigit(*p))
    { // Trailing 8s or 9s make this an invalid octal number.
    Loctal_hasDecimalDigits: // Entry point for "08"/"09" from above.
      hasDecimalDigits = true;
      while (isdigit(*++p))
      {}
    }

    // The number could be a float, so check errors below.
    switch (*p)
    {
    case '.':
      if (isfloat(p[1]))
        goto LscanFloat;
      break;
    case 'L':
      if (p[1] != 'i')
        break;
      goto LscanFloat;
    case 'i', 'f', 'F', 'e', 'E':
      goto LscanFloat;
    default:
    }

    version(D2)
    { // D2 deprecates all octal literals except 0..7.
    if (ulong_ >= 8 || hasDecimalDigits)
      error(t.start, MID.OctalNumbersDeprecated);
    }
    else
    {
    if (hasDecimalDigits)
      error(t.start, MID.OctalNumberHasDecimals);
    if (overflow)
      error(t.start, MID.OverflowOctalNumber);
    }
    //goto Lfinalize;

  Lfinalize:
    {
      enum Suffix
      {
        None = 0,
        Unsigned = 1,
        Long = 2
      }

      // Scan optional suffix: L, Lu, LU, u, uL, U or UL.
      Suffix suffix;
    Loop:
      while (1)
        switch (*p)
        {
        case 'L':
          if (suffix & Suffix.Long)
            break Loop; // Reject a second 'L'.
          suffix |= Suffix.Long;
          ++p;
          continue;
        case 'u', 'U':
          if (suffix & Suffix.Unsigned)
            break Loop; // Reject a second 'u'/'U'.
          suffix |= Suffix.Unsigned;
          ++p;
          continue;
        default:
          break Loop;
        }

      // Determine type of Integer.
      TOK kind;
      switch (suffix)
      {
      case Suffix.None:
        if (ulong_ & 0x8000_0000_0000_0000)
        { // Bit 63 set: only representable as ulong.
          if (isDecimal)
            error(t.start, MID.OverflowDecimalSign);
          kind = T!"UInt64";
        }
        else if (ulong_ & 0xFFFF_FFFF_0000_0000)
          kind = T!"Int64";
        else if (ulong_ & 0x8000_0000)
          kind = isDecimal ? T!"Int64" : T!"UInt32";
        else
          kind = T!"Int32";
        break;
      case Suffix.Unsigned:
        if (ulong_ & 0xFFFF_FFFF_0000_0000)
          kind = T!"UInt64";
        else
          kind = T!"UInt32";
        break;
      case Suffix.Long:
        if (ulong_ & 0x8000_0000_0000_0000)
        {
          if (isDecimal)
            error(t.start, MID.OverflowDecimalSign);
          kind = T!"UInt64";
        }
        else
          kind = T!"Int64";
        break;
      case Suffix.Unsigned | Suffix.Long:
        kind = T!"UInt64";
        break;
      default:
        assert(0);
      }

      t.kind = kind;
      if (kind == T!"Int64" || kind == T!"UInt64")
      {
        version(X86_64)
        t.intval.ulong_ = ulong_;
        else
        t.intval = tables.lookupUlong(ulong_);
      }
      else
        t.uint_ = cast(uint)ulong_;
      t.end = this.p = p;
      return;
    }

  LscanFloat:
    this.p = p;
    scanFloat(t);
    return;
  }

  /// Returns a zero-terminated copy of the string where all
  /// underscores are removed.
  static char[] copySansUnderscores(cchar* begin, cchar* end)
  {
    auto s = String(begin, end + 1).dup; // +1: include the terminator slot.
    s[Neg(1)] = 0; // Zero-terminate.
    return s.sub('_', "")[];
  }

  /// Scans a floating point number literal.
  ///
  /// $(BNF
  ////FloatLiteral := Float [fFL]? i?
  ////Float := DecFloat | HexFloat
  ////DecFloat := (DecDigits "." "_"* DecDigits? DecExponent?) |
  ////            ("." DecDigits DecExponent?) |
  ////            (DecDigits DecExponent)
  ////DecExponent := [eE] [+-]? DecDigits
  ////DecDigits := \d [\d_]*
  ////)
  void scanFloat(Token* t)
  {
    auto p = this.p;
    if (*p == '.')
    {
      assert(p[1] != '.');
      // This function was called by scan() or scanNumber().
      while (isdigi_(*++p))
      {}
    }
    else // This function was called by scanNumber().
      assert((*p).In('i', 'f', 'F', 'e', 'E') || p[0..2] == "Li");

    // Scan exponent.
    if (*p == 'e' || *p == 'E')
    {
      ++p;
      if (*p == '-' || *p == '+')
        ++p;
      if (isdigit(*p))
        while (isdigi_(*++p))
        {}
      else
        error(p, MID.FloatExpMustStartWithDigit);
    }

    this.p = p;
    // The parser for the value must not see underscores.
    finalizeFloat(t, copySansUnderscores(t.start, p));
  }

  /// Scans a hexadecimal floating point number literal.
  /// $(BNF
  ////HexFloat := "0" [xX] (HexDigits? "." HexDigits | HexDigits) HexExponent
  ////HexExponent := [pP] [+-]? DecDigits
  ////HexDigits := [a-fA-F\d] [a-fA-F\d_]*
  ////)
  void scanHexFloat(Token* t)
  {
    auto p = this.p;
    assert((*p).In('.', 'p', 'P'));
    MID mid = MID.HexFloatExponentRequired;
    if (*p == '.')
      while (ishexa_(*++p))
      {}
    // Decimal exponent is required.
    if (*p != 'p' && *p != 'P')
      goto Lerror;
    // Scan exponent
    assert((*p).In('p', 'P'));
    ++p;
    if (*p == '+' || *p == '-')
      ++p;
    if (!isdigit(*p))
    {
      mid = MID.HexFloatExpMustStartWithDigit;
      goto Lerror;
    }
    while (isdigi_(*++p))
    {}

    this.p = p;
    finalizeFloat(t, copySansUnderscores(t.start, p));
    return;
  Lerror:
    t.kind = T!"Float32"; // Fallback kind so the token is still usable.
    t.end = this.p = p;
    error(p, mid);
  }

  /// Sets the value of the token.
  /// Params:
  ///   t = Receives the value.
  ///   float_string = The well-formed float number string.
  void finalizeFloat(Token* t, cstring float_string)
  {
    auto p = this.p;
    assert(float_string.length && float_string[$-1] == 0);
    // Finally check suffixes.
    TOK kind = void;
    if (*p == 'f' || *p == 'F')
      ++p, kind = T!"Float32";
    else if (*p == 'L')
      ++p, kind = T!"Float80";
    else
      kind = T!"Float64";

    if (*p == 'i')
    {
      ++p;
      // Relies on the TOK enum layout: IFloatNN = FloatNN + 3.
      kind += 3; // Switch to imaginary counterpart.
      assert(kind.In(T!"IFloat32", T!"IFloat64", T!"IFloat80"));
    }
    // TODO: test for overflow/underflow according to target platform.
    //       CompilationContext must be passed to Lexer for this.
    auto f = lookupFloat(float_string);
    if (f.isPInf())
      error(t.start, MID.OverflowFloatNumber);
    // else if (f.isNInf())
    //   error(t.start, MID.UnderflowFloatNumber);
    // else if (f.isNaN())
    //   error(t.start, MID.NaNFloat);
    t.mpfloat = f;
    t.kind = kind;
    t.end = this.p = p;
    return;
  }

  /// Scans a special token sequence.
  ///
  /// $(BNF SpecialTokenSequence := "#line" Integer Filespec? EndOfLine)
  void scanSpecialTokenSequence(Token* t)
  {
    auto p = this.p;
    assert(*p == '#');

    auto hlval = new_!(Token.HashLineValue);

    MID mid;
    cchar* errorAtColumn = p;
    cchar* tokenEnd = ++p;

    // Compare the next four bytes with "line" in one uint comparison.
    if (*cast(uint*)p != chars_line)
    {
      mid = MID.ExpectedIdentifierSTLine;
      goto Lerror;
    }

    { // Start of scanning code block.
      p += 4;
      tokenEnd = p;

      // TODO: #line58"path/file" is legal. Require spaces?
      //       State.Space could be used for that purpose.
      enum State
      { /+Space,+/ Integer, OptionalFilespec, End }

      State state = State.Integer;

      while (!isEndOfLine(p))
      {
        if (isspace(*p))
        {} // Skip whitespace between the parts.
        else if (state == State.Integer)
        {
          if (!isdigit(*p))
          {
            errorAtColumn = p;
            mid = MID.ExpectedIntegerAfterSTLine;
            goto Lerror;
          }
          // Let the main scanner lex the line number token.
          auto newtok = new_!(Token);
          hlval.lineNum = newtok;
          this.p = p;
          scan(newtok);
          tokenEnd = p = this.p;
          if (newtok.kind != T!"Int32" && newtok.kind != T!"UInt32")
          {
            errorAtColumn = newtok.start;
            mid = MID.ExpectedIntegerAfterSTLine;
            goto Lerror;
          }
          state = State.OptionalFilespec;
          continue;
        }
        else if (state == State.OptionalFilespec && *p == '"')
        {
          auto fs = hlval.filespec = new_!(Token);
          fs.start = p;
          fs.kind = T!"Filespec";
          // Skip until closing '"'.
          while (*++p != '"' && !isEndOfLine(p))
            isascii(*p) || decodeUTF8(p); // Also validates UTF-8.
          if (*p != '"')
          { // Error.
            errorAtColumn = fs.start;
            mid = MID.UnterminatedFilespec;
            fs.end = p;
            tokenEnd = p;
            goto Lerror;
          }
          auto str = slice(fs.start + 1, p); // Get string excluding "".
          fs.strval = lookupString(str, '\0');
          fs.end = tokenEnd = ++p;
          state = State.End;
          continue;
        }
        else/+ if (state == State.End)+/
        { // Anything after the filespec is an error.
          errorAtColumn = tokenEnd;
          mid = MID.UnterminatedSpecialToken;
          goto Lerror;
        }
        ++p;
      }
      assert(isEndOfLine(p));

      if (state == State.Integer)
      { // The line number is mandatory.
        errorAtColumn = p;
        mid = MID.ExpectedIntegerAfterSTLine;
        goto Lerror;
      }
    } // End of scanning code block.

    // Evaluate #line only when not in token string.
    if (!inTokenString && hlval.lineNum)
    {
      if (!hlinfo)
      {
        hlinfo = new_!(Token.HashLineInfo);
        hlinfo.path = srcText.filePath;
      }
      hlinfo.setLineNum(this.lineNum, hlval.lineNum.sizet_);
      if (hlval.filespec)
        hlinfo.path = cast(cstring)hlval.filespec.strval.str;
    }

    if (0) // Only issue an error if jumped here.
    Lerror:
      error(errorAtColumn, mid);

    t.kind = TOK.HashLine;
    t.hlval = hlval;
    t.end = this.p = tokenEnd;
    return;
  }

  /// Returns the error line number.
  /// Adjusted by any active "#line" directive.
  size_t errorLineNumber(size_t lineNum)
  {
    if (hlinfo)
      lineNum -= hlinfo.lineNum;
    return lineNum;
  }

  /// Returns the file path for error messages.
  /// A "#line" directive may have overridden the real path.
  cstring errorFilePath()
  {
    return hlinfo ? hlinfo.path : srcText.filePath;
  }

  /// Forwards error parameters.
  void error(cchar* columnPos, MID mid, ...)
  {
    error(_arguments, _argptr, this.lineLoc, columnPos, diag.bundle.msg(mid));
  }

  /// ditto
  void error(LineLoc line, cchar* columnPos, MID mid, ...)
  {
    error(_arguments, _argptr, line, columnPos, diag.bundle.msg(mid));
  }

  /// Creates an error report and appends it to a list.
  /// Params:
  ///   line = The line number and pointer to the first character of a line.
  ///   columnPos = Points to the character where the error is located.
  ///   msg = The error message.
  void error(TypeInfo[] _arguments, va_list _argptr,
    LineLoc line, cchar* columnPos, cstring msg)
  {
    line.n = this.errorLineNumber(line.n);
    auto errorPath = errorFilePath();
    auto location = new Location(errorPath, line.n, line.p, columnPos);
    msg = diag.format(_arguments, _argptr, msg);
    auto error = new LexerError(location, msg);
    errors ~= error; // Keep a local list and also report to diag.
    diag ~= error;
  }

  /// Returns true if the current character to be decoded is
  /// a Unicode alpha character.
  /// Params:
  ///   ref_p = Is set to the last trail byte if true is returned.
  static bool scanUnicodeAlpha(ref cchar* ref_p)
  {
    auto p = ref_p;
    assert(!isascii(*p),
      "check for ASCII char before calling scanUnicodeAlpha().");
    dchar d = *p;
    ++p; // Move to second byte.
    // Error if second byte is not a trail byte.
    if (!isTrailByte(*p))
      return false;
    // Check for overlong sequences.
    if (d.In(0xE0, 0xF0, 0xF8, 0xFC) && (*p & d) == 0x80 ||
        (d & 0xFE) == 0xC0) // 1100000x
      return false;
    // Code snippets shared by the 2/3/4-byte branches below (via mixin.)
    const string checkNextByte = "if (!isTrailByte(*++p))"
                                 " return false;";
    const string appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";
    // Decode
    if ((d & 0b1110_0000) == 0b1100_0000)
    { // 2-byte sequence.
      d &= 0b0001_1111;
      mixin(appendSixBits);
    }
    else if ((d & 0b1111_0000) == 0b1110_0000)
    { // 3-byte sequence.
      d &= 0b0000_1111;
      mixin(appendSixBits ~
            checkNextByte ~ appendSixBits);
    }
    else if ((d & 0b1111_1000) == 0b1111_0000)
    { // 4-byte sequence.
      d &= 0b0000_0111;
      mixin(appendSixBits ~
            checkNextByte ~ appendSixBits ~
            checkNextByte ~ appendSixBits);
    }
    else
      return false;

    assert(isTrailByte(*p));
    if (!isValidChar(d) || !isUniAlpha(d))
      return false;
    // Only advance pointer if this is a Unicode alpha character.
    ref_p = p;
    return true;
  }

  /// Decodes the next UTF-8 sequence.
  ///
  /// Params:
  ///   ref_p = Set to the last trail byte.
  dchar decodeUTF8(ref cchar* ref_p)
  {
    auto p = ref_p;
    assert(!isascii(*p), "check for ASCII char before calling decodeUTF8().");
    dchar d = *p;

    ++p; // Move to second byte.
    // Error if second byte is not a trail byte.
    if (!isTrailByte(*p))
      goto Lerror2;

    // Check for overlong sequences.
    if (d.In(0xE0, 0xF0, 0xF8, 0xFC) && (*p & d) == 0x80 ||
        (d & 0xFE) == 0xC0) // 1100000x
      goto Lerror;

    // Code snippets shared by the decode labels below (via mixin.)
    enum checkNextByte = "if (!isTrailByte(*++p))"
                         " goto Lerror2;";
    enum appendSixBits = "d = (d << 6) | *p & 0b0011_1111;";

    // See how many bytes need to be decoded.
    if ((d & 0b1110_0000) == 0b1100_0000)
    { // 110xxxxx 10xxxxxx
      d &= 0b0001_1111;
      goto L2Bytes;
    }
    else if ((d & 0b1111_0000) == 0b1110_0000)
    { // 1110xxxx 10xxxxxx 10xxxxxx
      d &= 0b0000_1111;
      goto L3Bytes;
    }
    else if ((d & 0b1111_1000) == 0b1111_0000)
    { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      d &= 0b0000_0111;
      goto L4Bytes;
    }
    else
      // 5 and 6 byte UTF-8 sequences are not allowed yet.
      // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      goto Lerror;

    // Decode the bytes now. The labels fall through on purpose.
  L4Bytes:
    mixin(appendSixBits);
    mixin(checkNextByte);
  L3Bytes:
    mixin(appendSixBits);
    mixin(checkNextByte);
  L2Bytes:
    mixin(appendSixBits);

    assert(isTrailByte(*p));

    if (!isValidChar(d))
    {
    Lerror:
      // Three cases:
      // *) the UTF-8 sequence was successfully decoded but the resulting
      //    character is invalid.
      //    p points to last trail byte in the sequence.
      // *) the UTF-8 sequence is overlong.
      //    p points to second byte in the sequence.
      // *) the UTF-8 sequence has more than 4 bytes or starts with
      //    a trail byte.
      //    p points to second byte in the sequence.
      assert(isTrailByte(*p));
      // Move to next ASCII character or lead byte of a UTF-8 sequence.
      while (*p && !isValidLead(*p))
        ++p;
      --p;
      assert(!isTrailByte(p[1]) && p < this.endX());
    Lerror2:
      d = REPLACEMENT_CHAR;
      error(ref_p, MID.InvalidUTF8Sequence, formatBytes(ref_p, p));
    }
    // Advance the pointer and return.
2441 ref_p = p; 2442 return d; 2443 } 2444 2445 /// Encodes the character d and appends it to str. 2446 static void encodeUTF8(ref CharArray str, dchar d) 2447 { 2448 assert(!isascii(d), "check for ASCII char before calling encodeUTF8()."); 2449 assert(isValidChar(d), "cannot encode invalid char in encodeUTF8()."); 2450 2451 auto count = d < 0x800 ? 2 : (d < 0x10000 ? 3 : 4); 2452 if (count > str.rem) // Not enough space? 2453 str.rem = count; 2454 auto p = str.cur; 2455 str.cur += count; 2456 if (d < 0x800) 2457 { 2458 p[0] = 0xC0 | cast(char)(d >> 6); 2459 p[1] = 0x80 | (d & 0x3F); 2460 } 2461 else if (d < 0x10000) 2462 { 2463 p[0] = 0xE0 | cast(char)(d >> 12); 2464 p[1] = 0x80 | ((d >> 6) & 0x3F); 2465 p[2] = 0x80 | (d & 0x3F); 2466 } 2467 else if (d < 0x200000) 2468 { 2469 p[0] = 0xF0 | (d >> 18); 2470 p[1] = 0x80 | ((d >> 12) & 0x3F); 2471 p[2] = 0x80 | ((d >> 6) & 0x3F); 2472 p[3] = 0x80 | (d & 0x3F); 2473 } 2474 else 2475 assert(0); 2476 } 2477 2478 /// Formats the bytes between start and end (excluding end.) 2479 /// Returns: e.g.: "abc" -> "\x61\x62\x63" 2480 static cstring formatBytes(cchar* start, cchar* end) 2481 { 2482 const formatLen = 4; // `\xXX`.length 2483 const H = "0123456789ABCDEF"; // Hex numerals. 2484 auto strLen = end-start; 2485 char[] result = new char[strLen*formatLen]; // Allocate space. 2486 char* p = result.ptr; 2487 foreach (c; start[0..strLen]) 2488 (*p++ = '\\'), (*p++ = 'x'), (*p++ = H[c>>4]), (*p++ = H[c&0x0F]); 2489 assert(p is result.ptr+result.length); 2490 return result; 2491 } 2492 2493 /// Searches for an invalid UTF-8 sequence in str. 2494 /// Returns: a formatted string of the invalid sequence (e.g. "\xC0\x80"). 2495 static cstring findInvalidUTF8Sequence(cbinstr bstr) 2496 { 2497 auto str = cast(cstring)bstr; 2498 auto p = str.ptr, end = p + str.length; 2499 while (p < end) 2500 if (decode(p, end) == ERROR_CHAR) 2501 { 2502 auto begin = p; 2503 // Skip trail-bytes. 
        while (++p < end && !isValidLead(*p))
        {}
        return Lexer.formatBytes(begin, p);
      }
    assert(p == end);
    return null; // No invalid sequence found.
  }
} // End of Lexer

/// Tests the lexer with a list of tokens.
void testLexer()
{
  scope msg = new UnittestMsg("Testing class Lexer.");
  // A pair of the exact source text of a token and its expected kind.
  struct Pair
  {
    string tokenText;
    TOK kind;
  }
  static Pair[] pairs = [
    {"#!äöüß", TOK.Shebang}, {"\n", TOK.Newline},
    {"//çay", TOK.Comment}, {"\n", TOK.Newline},
    {"&", TOK.Amp},
    {"/*çağ*/", TOK.Comment}, {"&&", TOK.Amp2},
    {"/+çak+/", TOK.Comment}, {"&=", TOK.AmpEql},
    {">", TOK.Greater}, {"+", TOK.Plus},
    {">=", TOK.GreaterEql}, {"++", TOK.Plus2},
    {">>", TOK.Greater2}, {"+=", TOK.PlusEql},
    {">>=", TOK.Greater2Eql}, {"-", TOK.Minus},
    {">>>", TOK.Greater3}, {"--", TOK.Minus2},
    {">>>=", TOK.Greater3Eql}, {"-=", TOK.MinusEql},
    {"<", TOK.Less}, {"=", TOK.Equal},
    {"<=", TOK.LessEql}, {"==", TOK.Equal2},
    {"<>", TOK.LorG}, {"~", TOK.Tilde},
    {"<>=", TOK.LorEorG}, {"~=", TOK.TildeEql},
    {"<<", TOK.Less2}, {"*", TOK.Star},
    {"<<=", TOK.Less2Eql}, {"*=", TOK.StarEql},
    {"!", TOK.Exclaim}, {"/", TOK.Slash},
    {"!=", TOK.ExclaimEql}, {"/=", TOK.SlashEql},
    {"!<", TOK.UorGorE}, {"^", TOK.Caret},
    {"!>", TOK.UorLorE}, {"^=", TOK.CaretEql},
    {"!<=", TOK.UorG}, {"%", TOK.Percent},
    {"!>=", TOK.UorL}, {"%=", TOK.PercentEql},
    {"!<>", TOK.UorE}, {"(", TOK.LParen},
    {"!<>=", TOK.Unordered}, {")", TOK.RParen},
    {".", TOK.Dot}, {"[", TOK.LBracket},
    {"..", TOK.Dot2}, {"]", TOK.RBracket},
    {"...", TOK.Dot3}, {"{", TOK.LBrace},
    {"|", TOK.Pipe}, {"}", TOK.RBrace},
    {"||", TOK.Pipe2}, {":", TOK.Colon},
    {"|=", TOK.PipeEql}, {";", TOK.Semicolon},
    {"?", TOK.Question}, {",", TOK.Comma},
    {"$", TOK.Dollar}, {"cam", TOK.Identifier},
    {"çay", TOK.Identifier}, {".0", TOK.Float64},
    {"0", TOK.Int32}, {"\n", TOK.Newline},
    {"\r", TOK.Newline}, {"\r\n", TOK.Newline},
    {"\u2028", TOK.Newline}, {"\u2029", TOK.Newline},
    {"'c'", TOK.Character}, {`'\''`, TOK.Character},
    {`"dblq"`, TOK.String}, {"`raw`", TOK.String},
    {`r"aw"`, TOK.String}, {`x"0123456789abcdef"`, TOK.String},
  ];

  version(D2)
  { // D2-only tokens and delimited/token strings.
  static Pair[] pairs2 = [
    {"@", TOK.At},
    {"^^", TOK.Caret2},
    {"^^=", TOK.Caret2Eql},
    {"=>", TOK.EqlGreater},
    {"q\"ⱷ\n\nⱷ\"", TOK.String}, {`q"(())"`, TOK.String},
    {`q"{{}}"`, TOK.String}, {`q"[[]]"`, TOK.String},
    {`q"<<>>"`, TOK.String}, {`q"/__/"`, TOK.String},
    {`q"∆⟵✻⟶∆"`, TOK.String}, {`q"\⣯⣻\"`, TOK.String},
    {"q{toks...}", TOK.String}, {"q{({#line 0\n})}", TOK.String},
    {"q\"HDOC\nq\"***\"\nHDOC\"", TOK.String},
    {"q\"ȨÖF\nq{***}\nȨÖF\"", TOK.String},
    {`q{q"<>"q"()"q"[]"q"{}"q"//"q"\\"q{}}`, TOK.String},
  ];
  }
  else // D1
  {
  static Pair[] pairs2 = [
    {"\\n", TOK.String}, {"\\u2028", TOK.String}
  ];
  }
  pairs ~= pairs2;

  char[] src; // The source text to be scanned.

  // Join all token texts into a single string.
  // Shebang and line comments consume to end of line, so they must not get
  // a trailing space; everything else is separated by one space.
  foreach (i, pair; pairs)
    if (pair.kind == TOK.Comment &&
        pair.tokenText[1] == '/' || // Line comment.
        pair.kind == TOK.Shebang)
    {
      assert(pairs[i+1].kind == TOK.Newline); // Must be followed by a newline.
      src ~= pair.tokenText;
    }
    else
      src ~= pair.tokenText ~ " ";

  // Lex the constructed source text.
  auto tables = new LexerTables();
  auto lx = new Lexer(new SourceText("lexer_unittest", src), tables);
  lx.scanAll();

  foreach (e; lx.errors)
    Stdout.formatln("{}({},{})L: {}", e.filePath, e.loc, e.col, e.getMsg);

  auto token = lx.firstToken, last = lx.lastToken;

  // Compare the scanned token texts with the expected ones.
  for (size_t i; i < pairs.length && token < last; ++i, ++token)
    if (token.text != pairs[i].tokenText)
      assert(0, Format("Scanned ‘{0}’ but expected ‘{1}’",
        escapeNonPrintable(token.text), pairs[i].tokenText));
}

/// Tests the Lexer's peek() method.
void testLexerPeek()
{
  scope msg = new UnittestMsg("Testing method Lexer.peek()");
  auto tables = new LexerTables();
  auto sourceText = new SourceText("", "unittest { }");
  auto lx = new Lexer(sourceText, tables);
  lx.scanAll();

  // Peek through all tokens of "unittest { }".
  auto next = lx.head;
  lx.peek(next);
  assert(next.kind == TOK.Newline);
  lx.peek(next);
  assert(next.kind == TOK.Unittest);
  lx.peek(next);
  assert(next.kind == TOK.LBrace);
  lx.peek(next);
  assert(next.kind == TOK.RBrace);
  lx.peek(next);
  assert(next.kind == TOK.EOF);

  // An empty source yields only Newline and EOF.
  lx = new Lexer(new SourceText("", ""), tables);
  lx.scanAll();
  next = lx.head;
  lx.peek(next);
  assert(next.kind == TOK.Newline);
  lx.peek(next);
  assert(next.kind == TOK.EOF);
}

// TODO: not implemented yet; the comment lists the literals to be covered.
void testLexerNumbers()
{
  // Numbers unittest
  // 0L 0ULi 0_L 0_UL 0x0U 0x0p2 0_Fi 0_e2 0_F 0_i
  // 0u 0U 0uL 0UL 0L 0LU 0Lu
  // 0Li 0f 0F 0fi 0Fi 0i
  // 0b_1_LU 0b1000u
  // 0x232Lu
}