/// Author: Aziz Köksal
/// License: GPL3
/// $(Maturity very high)
module dil.lexer.Funcs;

import dil.Unicode;
import dil.String : slice;
import dil.Array;
import common;

const char[3] LS = "\u2028"; /// Unicode line separator (UTF-8 encoded).
const dchar LSd = 0x2028;  /// ditto (as a code point)
const char[3] PS = "\u2029"; /// Unicode paragraph separator (UTF-8 encoded).
const dchar PSd = 0x2029;  /// ditto (as a code point)
// isUnicodeNewline() and scanNewlineReverse() rely on LS and PS sharing
// their first two bytes and differing only in the third one.
static assert(LS[0] == PS[0] && LS[1] == PS[1]);

const dchar _Z_ = 26; /// Control+Z.

/// Casts a string to an integer at compile-time.
/// Allows for fast string comparison using integers:
/// *cast(uint*)"\xAA\xBB\xCC\xDD".ptr == castInt("\xAA\xBB\xCC\xDD")
/// Params:
///   s = The string to pack; at most size_t.sizeof bytes long.
/// Returns: the bytes of s packed into an integer, laid out the way
///   they would be read from memory on the target's endianness.
static size_t castInt(cstring s)
{
  assert(s.length <= size_t.sizeof);
  size_t x;
  foreach (i, c; s)
    version(BigEndian)
      x = (x << 8) | c; // Add c as LSByte.
    else
      // Widen c before shifting: a plain `c << i*8` is evaluated as int,
      // which sign-extends into x for bytes >= 0x80 at i == 3 and cannot
      // be shifted by 32 or more bits at all.
      x |= (cast(size_t)c << i*8); // Add c as MSByte.
  return x;
}
version(LittleEndian)
static assert(castInt("\xAA\xBB\xCC\xDD") == 0xDDCCBBAA &&
  castInt("\xAB\xCD\xEF") == 0xEFCDAB && castInt("\xAB\xCD") == 0xCDAB);
else
static assert(castInt("\xAA\xBB\xCC\xDD") == 0xAABBCCDD &&
  castInt("\xAB\xCD\xEF") == 0xABCDEF && castInt("\xAB\xCD") == 0xABCD);

/// Returns: true if d is a Unicode line or paragraph separator.
bool isUnicodeNewlineChar(dchar d)
{
  return d == LSd || d == PSd;
}

/// Returns: true if p points to a line or paragraph separator.
///
/// p[1] and p[2] are only read when the preceding bytes already matched.
bool isUnicodeNewline(cchar* p)
{
  return *p == LS[0] && p[1] == LS[1] && (p[2] == LS[2] || p[2] == PS[2]);
}

/// Returns: true if p points to the start of a Newline.
/// $(BNF
////Newline := "\n" | "\r" | "\r\n" | LS | PS
////LS := "\u2028"
////PS := "\u2029"
////)
bool isNewline(cchar* p)
{
  return *p == '\n' || *p == '\r' || isUnicodeNewline(p);
}

/// Returns: true if c is a Newline character.
bool isNewline(dchar c)
{
  if (c == '\n' || c == '\r')
    return true;
  return isUnicodeNewlineChar(c);
}

/// Returns: true if c is an EOF character.
/// $(BNF
////EOF := "\0" | _Z_
////_Z_ := "\x1A"
////)
bool isEOF(dchar c)
{
  return c == '\0' || c == _Z_;
}

/// Returns: true if p points to the first character of an EndOfLine.
/// $(BNF EndOfLine := Newline | EOF)
bool isEndOfLine(cchar* p)
{
  if (isNewline(p))
    return true;
  return isEOF(*p);
}

/// Scans a Newline and sets p one character past it.
/// p is left untouched when no Newline starts at p.
/// Returns: true if found or false otherwise.
bool scanNewline(ref cchar* p)
in { assert(p); }
body
{
  auto q = p;
  if (*q == '\r')
    q += (q[1] == '\n') ? 2 : 1; // "\r\n" counts as a single Newline.
  else if (*q == '\n')
    q += 1;
  else if (isUnicodeNewline(q))
    q += 3; // LS and PS are both three bytes long.
  else
    return false;
  p = q;
  return true;
}

/// Scans a Newline and sets p one character past it.
/// Never reads at or beyond end.
/// Returns: true if found or false otherwise.
bool scanNewline(ref cchar* p, cchar* end)
in { assert(p && p < end); }
body
{
  auto q = p;
  if (*q == '\r')
  {
    ++q;
    if (q < end && *q == '\n') // "\r\n" counts as a single Newline.
      ++q;
  }
  else if (*q == '\n')
    ++q;
  else if (q+2 < end && isUnicodeNewline(q))
    q += 3; // LS and PS are both three bytes long.
  else
    return false;
  p = q;
  return true;
}

/// Scans a Newline in reverse direction and sets end
/// on the first character of the newline.
/// end is left untouched when it doesn't point to the last byte of a Newline.
/// Returns: true if found or false otherwise.
bool scanNewlineReverse(cchar* begin, ref cchar* end)
{
  auto last = *end;
  if (last == '\n')
  {
    // Step back over the '\r' of a "\r\n" pair, if there is one.
    if (begin <= end-1 && end[-1] == '\r')
      end--;
    return true;
  }
  if (last == '\r')
    return true;
  // Final byte of LS or PS: verify the two shared leading bytes.
  if ((last == LS[2] || last == PS[2]) &&
      begin <= end-2 && end[-1] == LS[1] && end[-2] == LS[0])
  {
    end -= 2;
    return true;
  }
  return false;
}

/// Scans a D identifier.
/// Params:
///   ref_p = Where to start.
///   end = Where it ends.
/// Returns: the identifier if valid (sets ref_p one past the id,) or
///          null if invalid (leaves ref_p unchanged.)
cstring scanIdentifier(ref cchar* ref_p, cchar* end)
in { assert(ref_p && ref_p < end); }
body
{
  auto p = ref_p;
  // NOTE(review): scanUnicodeAlpha() presumably leaves p on the last byte of
  // the scanned character, so that the p++ below steps past it -- confirm
  // against dil.Unicode.
  if (isidbeg(*p) || scanUnicodeAlpha(p, end)) // IdStart
  {
    do // IdChar*
      p++;
    while (p < end && (isident(*p) || scanUnicodeAlpha(p, end)));
    auto identifier = slice(ref_p, p);
    ref_p = p;
    return identifier;
  }
  return null;
}

/// Returns true if p points to the start of a D identifier.
bool isIdentifierStart(cchar* p, cchar* end)
{
  return isidbeg(*p) || isUnicodeAlpha(p, end);
}

/// Returns s with non-printable characters escaped.
///
/// Bytes that can't be decoded are escaped individually as \xYY.
/// s itself is returned if nothing was written to the output buffer.
cstring escapeNonPrintable(cstring s)
{
  char[16] buffer; // Large enough for the longest escape sequence.
  CharArray s2;
  size_t i, prev; // prev: Index one past the last escaped character.
  while (i < s.length)
  {
    auto j = i; // Remember index of the current character.
    auto c = decode(s, i);
    if (i == j)
      c = s[i++] | 1<<31; // Error decoding char: set special flag.
    if (auto n = escapeNonPrintable(c, buffer.ptr))
    {
      if (!prev) // Reserve space when appending the first time.
        s2.cap = s.length + n - (i-j);
      s2 ~= s[prev..j]; // Previous unescaped string.
      s2 ~= buffer[0..n]; // Escape sequence.
      prev = i;
    }
  }
  if (prev && prev != s.length)
    s2 ~= s[prev..$]; // Unescaped remainder after the last escape sequence.
  return s2.ptr ? s2[] : s;
}

/// Returns an escape sequence if c is not printable,
/// otherwise the result of encode() for c.
cstring escapeNonPrintable(dchar c)
{
  char[16] buffer;
  if (auto n = escapeNonPrintable(c, buffer.ptr))
    return buffer[0..n].dup;
  else
    return encode(buffer.ptr, c).dup;
}

/// Writes an escape sequence to p if c is not printable.
///
/// $(UL
/// $(LI "\0", "\a", "\b", "\f", "\n", "\r", "\t" and "\v" for the
///   respective ASCII control characters.)
/// $(LI "\xYY" for the remaining ASCII control characters and 0x7F.)
/// $(LI "\u00YY" for the C1 control characters (U+0080 - U+009F.))
/// $(LI "\u2028" and "\u2029" for the line and paragraph separators.)
/// $(LI One "\xYY" per byte for values rejected by isValidChar().)
/// )
/// Params:
///   c = The character. If bit 31 is set, the low byte is written as \xYY.
///   p = The destination buffer. Up to 16 bytes are written.
/// Returns the number of characters written (0 if c needs no escaping.)
size_t escapeNonPrintable(dchar c, char* p)
{
  enum H = "0123456789ABCDEF"; // Hex numerals.
  size_t n; // Number of bytes written.
  if (isascii(c))
  { // ASCII
    switch (c)
    {
    case '\0': c = '0'; goto Lcommon;
    case '\a': c = 'a'; goto Lcommon;
    case '\b': c = 'b'; goto Lcommon;
    case '\f': c = 'f'; goto Lcommon;
    case '\n': c = 'n'; goto Lcommon;
    case '\r': c = 'r'; goto Lcommon;
    case '\t': c = 't'; goto Lcommon;
    case '\v': c = 'v'; goto Lcommon;
    Lcommon:
      p[0..n=2] = ['\\', cast(char)c];
      break;
    default:
      if (c < 0x20 || c == 0x7F) // Special non-printable characters.
        goto LoneByte;
    }
  }
  else
  { // UNICODE
    // TODO: write function isUniPrintable() similar to isUniAlpha().
    // The lower bound used to read `0x80 >= c`, which only matched U+0080
    // in this branch and left U+0081 - U+009F unescaped.
    if (0x80 <= c && c <= 0x9F) // C1 control character set.
      p[0..n=6] = ['\\', 'u', '0', '0', H[c>>4], H[c & 0x0F]];
    else if (c == '\u2028' || c == '\u2029')
      p[0..n=6] = ['\\', 'u', '2', '0', '2', H[c & 0x0F]];
    else if (!isValidChar(c))
    {
      if (c & 1<<31) // Check for the flag that forces a \xYY encoding.
        c &= 0xFF;
      if (c <= 0xFF)
      LoneByte:
        p[0..n=4] = ['\\', 'x', H[c>>4], H[c & 0x0F]];
      else if (c <= 0xFFFF)
        p[0..n=8] = ['\\', 'x', H[c>>12], H[c>>8 & 0x0F],
                     '\\', 'x', H[c>>4 & 0x0F], H[c & 0x0F]];
      else
        p[0..n=16] = ['\\', 'x', H[c>>28], H[c>>24 & 0x0F],
                      '\\', 'x', H[c>>20 & 0x0F], H[c>>16 & 0x0F],
                      '\\', 'x', H[c>>12 & 0x0F], H[c>>8 & 0x0F],
                      '\\', 'x', H[c>>4 & 0x0F], H[c & 0x0F]];
    }
  }
  return n;
}


/// ASCII character properties table.
// The low byte of an entry holds the CProperty flags of the character,
// the second byte holds its escape value (see char2ev().)
// The literal is the output of the version(gen_ptable) module constructor.
static const int[256] ptable = [
 0, 0, 0, 0, 0, 0, 0, 0, 0,32, 0,32,32, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 0, 0x2200, 0, 0, 0, 0, 0x2700, 0, 0, 0, 0, 0, 0, 0, 0,
 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0x3f00,
 0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0x5c00, 0, 0,16,
 0, 0x70c, 0x80c,12,12,12, 0xc0c, 8, 8, 8, 8, 8, 8, 8, 0xa08, 8,
 8, 8, 0xd08, 8, 0x908, 8, 0xb08, 8, 8, 8, 8, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];

/// Enumeration of character property flags.
enum CProperty
{
       Octal = 1,    /// [0-7]
       Digit = 1<<1, /// [0-9]
         Hex = 1<<2, /// [0-9a-fA-F]
       Alpha = 1<<3, /// [a-zA-Z]
  Underscore = 1<<4, /// [_]
  Whitespace = 1<<5  /// [ \t\v\f]
}

const uint EVMask = 0xFF00; // Bit mask for escape value.

private alias CP = CProperty;
/// Returns: true if c is an octal digit.
int isoctal(char c) { return ptable[c] & CP.Octal; }
/// Returns: true if c is a decimal digit.
int isdigit(char c) { return ptable[c] & CP.Digit; }
/// ditto
/// Values above char.max are never digits. They must not be truncated
/// to a char, otherwise e.g. U+0139 would be classified like '9'.
int isdigit(uint c) { return c <= char.max ? isdigit(cast(char)c) : 0; }
/// Returns: true if c is a decimal digit or '_'.
int isdigi_(char c) { return ptable[c] & (CP.Digit | CP.Underscore); }
/// Returns: true if c is a hexadecimal digit.
int ishexad(char c) { return ptable[c] & CP.Hex; }
/// ditto
/// Values above char.max are never hexadecimal digits (no truncation.)
int ishexad(uint c) { return c <= char.max ? ishexad(cast(char)c) : 0; }
/// Returns: true if c is a hexadecimal digit or '_'.
int ishexa_(char c) { return ptable[c] & (CP.Hex | CP.Underscore); }
/// Returns: true if c is a letter.
int isalpha(char c) { return ptable[c] & CP.Alpha; }
/// Returns: true if c is an alphanumeric.
int isalnum(char c) { return ptable[c] & (CP.Alpha | CP.Digit); }
/// Returns: true if c is the beginning of a D identifier (only ASCII.)
int isidbeg(char c) { return ptable[c] & (CP.Alpha | CP.Underscore); }
/// ditto
/// Values above char.max yield 0. They must not be truncated to a char,
/// otherwise e.g. U+0161 would be classified like 'a'.
int isidbeg(dchar c) { return c <= char.max ? isidbeg(cast(char)c) : 0; }
/// ditto
int isidbeg(uint c) { return c <= char.max ? isidbeg(cast(char)c) : 0; }
/// Returns: true if c is a D identifier character (only ASCII.)
int isident(char c) { return ptable[c] & (CP.Alpha|CP.Underscore|CP.Digit); }
/// ditto
/// Values above char.max yield 0 (no truncation.)
int isident(uint c) { return c <= char.max ? isident(cast(char)c) : 0; }
/// Returns: true if c is a whitespace character.
int isspace(char c) { return ptable[c] & CP.Whitespace; }
/// ditto
/// Values above char.max yield 0 (no truncation.)
int isspace(uint c) { return c <= char.max ? isspace(cast(char)c) : 0; }
/// Returns: the escape value for c.
int char2ev(char c) { return ptable[c] >> 8; /*(ptable[c] & EVMask) >> 8;*/ }
/// Returns: true if c is an ASCII character.
int isascii(uint c) { return c < 128; }

/// Returns true if the string is empty or has only whitespace characters.
bool isAllSpace(cchar* start, cchar* end)
{
  for (; start < end; start++)
    if (!isspace(*start))
      return false;
  return true;
}

/// Converts c to its hexadecimal value. Returns false if c isn't a hex digit.
/// c is left unchanged when false is returned.
bool hex2val(Char)(ref Char c)
{
  // Compare as uint: for narrow Char types the subtraction is done in int,
  // and a negative difference would otherwise pass the "< 10"/"< 6" checks
  // (e.g. ' ' - '0' == -16.)
  if (cast(uint)(c - '0') < 10)
    c -= '0';
  else if (cast(uint)((c|0x20) - 'a') < 6) // 'A'|0x20 == 'a'
    c = cast(Char)((c|0x20) - 'a' + 10);
  else
    return false;
  return true;
}

// Generator for the ptable literal: recomputes every entry and prints
// the table as an array literal, 16 entries per row.
version(gen_ptable)
static this()
{
  alias p = ptable;
  assert(p.length == 256);
  // Initialize character properties table.
  for (size_t i; i < p.length; ++i)
  {
    p[i] = 0; // Reset
    if ('0' <= i && i <= '7')
      p[i] |= CP.Octal;
    if ('0' <= i && i <= '9')
      p[i] |= CP.Digit | CP.Hex;
    if ('a' <= i && i <= 'f' || 'A' <= i && i <= 'F')
      p[i] |= CP.Hex;
    if ('a' <= i && i <= 'z' || 'A' <= i && i <= 'Z')
      p[i] |= CP.Alpha;
    if (i == '_')
      p[i] |= CP.Underscore;
    if (i == ' ' || i == '\t' || i == '\v' || i == '\f')
      p[i] |= CP.Whitespace;
  }
  // Store escape sequence values in second byte.
  assert(CProperty.max <= ubyte.max,
    "character property flags and escape value byte overlap.");
  p['\''] |= 39 << 8;
  p['"'] |= 34 << 8;
  p['?'] |= 63 << 8;
  p['\\'] |= 92 << 8;
  p['a'] |= 7 << 8;
  p['b'] |= 8 << 8;
  p['f'] |= 12 << 8;
  p['n'] |= 10 << 8;
  p['r'] |= 13 << 8;
  p['t'] |= 9 << 8;
  p['v'] |= 11 << 8;
  // Print a formatted array literal.
  char[] array = "[\n".dup;
  foreach (i, c; ptable)
  {
    array ~= Format((c>255?" 0x{0:x},":"{0,2},"), c) ~ (((i+1) % 16) ? "":"\n");
  }
  array[$-2..$] = "\n]"; // Overwrite the trailing ",\n" of the last row.
  Stdout(array).newline;
}