1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity very high)
4 module dil.lexer.Funcs;
5 
6 import dil.Unicode;
7 import dil.String : slice;
8 import dil.Array;
9 import common;
10 
/// UTF-8 encoding of the Unicode line separator (U+2028).
const char[3] LS = "\u2028";
/// UTF-8 encoding of the Unicode paragraph separator (U+2029).
const char[3] PS = "\u2029";
/// Code point of the Unicode line separator.
const dchar LSd = '\u2028';
/// Code point of the Unicode paragraph separator.
const dchar PSd = '\u2029';
// Both encodings share their first two bytes; only the third byte differs.
// The byte-wise newline checks in this module compare against LS[0] and LS[1].
static assert(LS[0] == PS[0]);
static assert(LS[1] == PS[1]);

/// Control+Z (0x1A).
const dchar _Z_ = '\x1A';
18 
/// Packs the bytes of a string into an integer at compile-time.
///
/// The result has the same value as reading the string's bytes from memory
/// as an integer on the target machine, which allows for fast string
/// comparison using integers:
/// *cast(uint*)"\xAA\xBB\xCC\xDD".ptr == castInt("\xAA\xBB\xCC\xDD")
/// Params:
///   s = The string to pack. Must not be longer than size_t.sizeof bytes.
/// Returns: the packed integer value.
static size_t castInt(cstring s)
{
  assert(s.length <= size_t.sizeof);
  size_t x;
  foreach (i, c; s)
  {
    version(BigEndian)
      x = (x << 8) | c; // Add c as LSByte.
    else
      // Add c as MSByte. Widen c to size_t before shifting: a char is
      // promoted to int, which would sign-extend into a 64-bit size_t when
      // bit 31 gets set and cannot be shifted by 32 bits or more.
      x |= cast(size_t)c << (i*8);
  }
  return x;
}
version(LittleEndian)
static assert(castInt("\xAA\xBB\xCC\xDD") == 0xDDCCBBAA &&
  castInt("\xAB\xCD\xEF") == 0xEFCDAB && castInt("\xAB\xCD") == 0xCDAB);
else
static assert(castInt("\xAA\xBB\xCC\xDD") == 0xAABBCCDD &&
  castInt("\xAB\xCD\xEF") == 0xABCDEF && castInt("\xAB\xCD") == 0xABCD);
39 
/// Returns: true if d is the Unicode line separator (LSd) or the
/// paragraph separator (PSd.)
bool isUnicodeNewlineChar(dchar d)
{
  switch (d)
  {
  case LSd, PSd:
    return true;
  default:
    return false;
  }
}
45 
/// Returns: true if the three bytes at p are the UTF-8 encoding of a
/// line or paragraph separator.
/// The bytes are read left to right and reading stops at the first mismatch.
bool isUnicodeNewline(cchar* p)
{
  // LS and PS have the same two leading bytes.
  if (p[0] != LS[0] || p[1] != LS[1])
    return false;
  const last = p[2];
  return last == LS[2] || last == PS[2];
}
51 
/// Returns: true if p points to the first byte of a Newline.
/// $(BNF
////Newline := "\n" | "\r" | "\r\n" | LS | PS
////LS := "\u2028"
////PS := "\u2029"
////)
bool isNewline(cchar* p)
{
  switch (*p)
  {
  case '\n', '\r':
    return true;
  default:
    return isUnicodeNewline(p);
  }
}
62 
/// Returns: true if the code point c is '\n', '\r' or a Unicode
/// line/paragraph separator.
bool isNewline(dchar c)
{
  if (c == '\n' || c == '\r')
    return true;
  return isUnicodeNewlineChar(c);
}
68 
/// Returns: true if c is an EOF character.
/// $(BNF
////EOF := "\0" | _Z_
////_Z_ := "\x1A"
////)
bool isEOF(dchar c)
{
  switch (c)
  {
  case '\0', _Z_:
    return true;
  default:
    return false;
  }
}
78 
/// Returns: true if p points to the first character of an EndOfLine,
/// i.e. a Newline or an EOF character.
/// $(BNF EndOfLine := Newline | EOF)
bool isEndOfLine(cchar* p)
{
  if (isNewline(p))
    return true;
  return isEOF(*p);
}
85 
/// Scans a Newline at p and, if one is found, advances p one character
/// past it. "\r\n" is consumed as a single Newline.
/// No end pointer is taken: up to two bytes after p may be read.
/// Returns: true if found or false otherwise (p is left unchanged.)
bool scanNewline(ref cchar* p)
in { assert(p); }
body
{
  if (*p == '\r')
    p += (p[1] == '\n') ? 2 : 1; // "\r\n" or a lone "\r".
  else if (*p == '\n')
    p++;
  else if (isUnicodeNewline(p))
    p += 3; // LS and PS are three bytes long.
  else
    return false;
  return true;
}
109 
/// Scans a Newline at p without reading at or beyond end and, if one is
/// found, advances p one character past it.
/// "\r\n" is consumed as a single Newline.
/// Returns: true if found or false otherwise (p is left unchanged.)
bool scanNewline(ref cchar* p, cchar* end)
in { assert(p && p < end); }
body
{
  const first = *p;
  if (first == '\r')
  {
    // Only look at the next byte if it is still inside the buffer.
    const isCRLF = p+1 < end && p[1] == '\n';
    p += isCRLF ? 2 : 1;
  }
  else if (first == '\n')
    p++;
  else if (p+2 < end && isUnicodeNewline(p))
    p += 3; // LS and PS are three bytes long.
  else
    return false;
  return true;
}
133 
/// Scans a Newline backwards, starting at the character end points to.
/// If found, end is set to the first character of the Newline.
/// Params:
///   begin = Lower bound; no byte before begin is read.
///   end = Points to the last byte of the presumed Newline.
/// Returns: true if found or false otherwise (end is left unchanged.)
bool scanNewlineReverse(cchar* begin, ref cchar* end)
{
  const last = *end;
  if (last == '\r')
    return true;
  if (last == '\n')
  {
    // Step back over the '\r' of a "\r\n" pair.
    if (begin <= end-1 && end[-1] == '\r')
      end--;
    return true;
  }
  // The third byte of LS or PS has to be preceded by their two lead bytes.
  if ((last == LS[2] || last == PS[2]) &&
      begin <= end-2 && end[-1] == LS[1] && end[-2] == LS[0])
  {
    end -= 2;
    return true;
  }
  return false;
}
158 
/// Scans a D identifier.
/// Params:
///   ref_p = Where to start.
///   end = Where it ends.
/// Returns: the identifier if valid (sets ref_p one past the id,) or
///          null if invalid (leaves ref_p unchanged.)
cstring scanIdentifier(ref cchar* ref_p, cchar* end)
in { assert(ref_p && ref_p < end); }
body
{
  auto cur = ref_p;
  // IdStart: an ASCII identifier start or a Unicode alpha.
  if (!isidbeg(*cur) && !scanUnicodeAlpha(cur, end))
    return null;
  // IdChar*: step past the current character, then continue for as long
  // as another identifier character follows.
  for (cur++; cur < end && (isident(*cur) || scanUnicodeAlpha(cur, end)); cur++)
  {}
  auto identifier = slice(ref_p, cur);
  ref_p = cur;
  return identifier;
}
181 
/// Returns true if p points to the start of a D identifier:
/// either an ASCII identifier start character or a Unicode alpha.
bool isIdentifierStart(cchar* p, cchar* end)
{
  if (isidbeg(*p))
    return true;
  return isUnicodeAlpha(p, end);
}
187 
/// Returns s with non-printable characters escaped.
/// Bytes that cannot be decoded are escaped individually as \xYY.
/// The original string is returned if nothing was written to the
/// output array.
cstring escapeNonPrintable(cstring s)
{
  char[16] escBuf; // Receives one escape sequence at a time.
  CharArray result;
  size_t pos, copied; // Read position; end of the last part handled.
  while (pos < s.length)
  {
    auto start = pos; // Remember index of the current character.
    auto c = decode(s, pos);
    if (pos == start) // Error decoding char: take one byte, set special flag.
      c = s[pos++] | 1<<31;
    auto n = escapeNonPrintable(c, escBuf.ptr);
    if (!n)
      continue; // Nothing to escape; the character is copied later.
    if (!copied) // Reserve space when appending the first time.
      result.cap = s.length + n - (pos-start);
    result ~= s[copied..start]; // Previous unescaped string.
    result ~= escBuf[0..n]; // Escape sequence.
    copied = pos;
  }
  // Append the unescaped rest, if any.
  if (copied && copied != s.length)
    result ~= s[copied..$];
  return result.ptr ? result[] : s;
}
213 
/// Returns an escape sequence if c is not printable,
/// otherwise the result of encode() for c.
/// The returned string is a fresh copy in both cases.
cstring escapeNonPrintable(dchar c)
{
  char[16] buffer;
  auto n = escapeNonPrintable(c, buffer.ptr);
  if (!n) // No escape sequence was written.
    return encode(buffer.ptr, c).dup;
  return buffer[0..n].dup;
}
223 
/// Writes an escape sequence to p if c is not printable.
///
/// ASCII control characters with a named escape are written as e.g. "\n",
/// other ASCII control characters and DEL as "\xYY", C1 control characters
/// (U+0080..U+009F) and the line/paragraph separators as "\uXXXX", and
/// invalid characters as a series of "\xYY" sequences.
/// Params:
///   c = The character to inspect. If bit 31 is set, the low byte
///       is written as a single "\xYY" sequence.
///   p = Destination buffer. Must have room for 16 characters.
/// Returns the number of characters written (0 if c needs no escaping.)
size_t escapeNonPrintable(dchar c, char* p)
{
  enum H = "0123456789ABCDEF"; // Hex numerals.
  size_t n; // Number of bytes written.
  if (isascii(c))
  { // ASCII
    switch (c)
    {
    case '\0': c = '0'; goto Lcommon;
    case '\a': c = 'a'; goto Lcommon;
    case '\b': c = 'b'; goto Lcommon;
    case '\f': c = 'f'; goto Lcommon;
    case '\n': c = 'n'; goto Lcommon;
    case '\r': c = 'r'; goto Lcommon;
    case '\t': c = 't'; goto Lcommon;
    case '\v': c = 'v'; goto Lcommon;
    Lcommon:
      p[0..n=2] = ['\\', cast(char)c];
      break;
    default:
      if (c < 0x20 || c == 0x7F) // Special non-printable characters.
        goto LoneByte;
    }
  }
  else
  { // UNICODE
    // TODO: write function isUniPrintable() similar to isUniAlpha().
    // The three cases below are mutually exclusive. A flagged character
    // (bit 31 set) is far above 0x9F and falls through to the last case.
    if (0x80 <= c && c <= 0x9F) // C1 control character set.
      p[0..n=6] = ['\\', 'u', '0', '0', H[c>>4], H[c & 0x0F]];
    else if (c == '\u2028' || c == '\u2029')
      p[0..n=6] = ['\\', 'u', '2', '0', '2', H[c & 0x0F]];
    else if (!isValidChar(c))
    {
      if (c & 1<<31) // Check for the flag that forces a \xYY encoding.
        c &= 0xFF;
      if (c <= 0xFF)
      LoneByte:
        p[0..n=4] = ['\\', 'x', H[c>>4], H[c & 0x0F]];
      else if (c <= 0xFFFF)
        p[0..n=8] = ['\\', 'x', H[c>>12], H[c>>8 & 0x0F],
                     '\\', 'x', H[c>>4 & 0x0F], H[c & 0x0F]];
      else
        p[0..n=16] = ['\\', 'x', H[c>>28], H[c>>24 & 0x0F],
                      '\\', 'x', H[c>>20 & 0x0F], H[c>>16 & 0x0F],
                      '\\', 'x', H[c>>12 & 0x0F], H[c>>8 & 0x0F],
                      '\\', 'x', H[c>>4 & 0x0F], H[c & 0x0F]];
    }
  }
  return n;
}
276 
277 
/// ASCII character properties table, indexed by character value.
///
/// The low byte of an entry holds the CProperty flags of the character.
/// The second byte holds the value of the escape sequence that the
/// character stands for after a backslash (e.g. 'n' -> 0x0A), if any.
/// Entries 128..255 are all zero.
static const int[256] ptable = [
 0, 0, 0, 0, 0, 0, 0, 0, 0,32, 0,32,32, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32, 0, 0x2200, 0, 0, 0, 0, 0x2700, 0, 0, 0, 0, 0, 0, 0, 0,
 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 0, 0, 0, 0, 0, 0x3f00,
 0,12,12,12,12,12,12, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0x5c00, 0, 0,16,
 0, 0x70c, 0x80c,12,12,12, 0xc0c, 8, 8, 8, 8, 8, 8, 8, 0xa08, 8,
 8, 8, 0xd08, 8, 0x908, 8, 0xb08, 8, 8, 8, 8, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
];
297 
/// Character property flags, stored in the low byte of a ptable entry.
enum CProperty
{
  Octal      = 0x01, /// [0-7]
  Digit      = 0x02, /// [0-9]
  Hex        = 0x04, /// [0-9a-fA-F]
  Alpha      = 0x08, /// [a-zA-Z]
  Underscore = 0x10, /// [_]
  Whitespace = 0x20  /// [ \t\v\f]
}

/// Bit mask for the escape value (second byte of a ptable entry.)
const uint EVMask = 0xFF << 8;
310 
private alias CP = CProperty;

// The functions below return the masked flags of the ptable entry:
// non-zero means true, but the value is not necessarily 1.
//
// The overloads that take a uint or dchar return 0 for values above
// char.max. A plain cast(char) would drop the upper bits and make e.g.
// U+0130 look like '0'.

/// Returns: true if c is an octal digit.
int isoctal(char c) { return ptable[c] & CP.Octal; }
/// Returns: true if c is a decimal digit.
int isdigit(char c) { return ptable[c] & CP.Digit; }
/// ditto
int isdigit(uint c) { return c <= char.max ? isdigit(cast(char)c) : 0; }
/// Returns: true if c is a decimal digit or '_'.
int isdigi_(char c) { return ptable[c] & (CP.Digit | CP.Underscore); }
/// Returns: true if c is a hexadecimal digit.
int ishexad(char c) { return ptable[c] & CP.Hex; }
/// ditto
int ishexad(uint c) { return c <= char.max ? ishexad(cast(char)c) : 0; }
/// Returns: true if c is a hexadecimal digit or '_'.
int ishexa_(char c) { return ptable[c] & (CP.Hex | CP.Underscore); }
/// Returns: true if c is a letter.
int isalpha(char c) { return ptable[c] & CP.Alpha; }
/// Returns: true if c is an alphanumeric.
int isalnum(char c) { return ptable[c] & (CP.Alpha | CP.Digit); }
/// Returns: true if c is the beginning of a D identifier (only ASCII.)
int isidbeg(char c) { return ptable[c] & (CP.Alpha | CP.Underscore); }
/// ditto
int isidbeg(dchar c) { return c <= char.max ? isidbeg(cast(char)c) : 0; }
/// ditto
int isidbeg(uint c) { return c <= char.max ? isidbeg(cast(char)c) : 0; }
/// Returns: true if c is a D identifier character (only ASCII.)
int isident(char c) { return ptable[c] & (CP.Alpha|CP.Underscore|CP.Digit); }
/// ditto
int isident(uint c) { return c <= char.max ? isident(cast(char)c) : 0; }
/// Returns: true if c is a whitespace character.
int isspace(char c) { return ptable[c] & CP.Whitespace; }
/// ditto
int isspace(uint c) { return c <= char.max ? isspace(cast(char)c) : 0; }
/// Returns: the escape value for c (0 if c has none.)
int char2ev(char c) { return ptable[c] >> 8; /*(ptable[c] & EVMask) >> 8;*/ }
/// Returns: true if c is an ASCII character.
int isascii(uint c) { return c < 128; }
348 
/// Returns true if the range [start, end) is empty or consists only of
/// whitespace characters (as defined by isspace().)
bool isAllSpace(cchar* start, cchar* end)
{
  while (start < end)
  {
    if (!isspace(*start))
      return false;
    start++;
  }
  return true;
}
357 
/// Converts c to its hexadecimal value.
/// Params:
///   c = The character to convert. It is replaced by its value (0..15)
///       if it is a hex digit, and left untouched otherwise.
/// Returns: true if c was a hex digit, false if it wasn't.
bool hex2val(Char)(ref Char c)
{
  // Explicit range checks are used instead of "c - '0' < 10": that
  // difference is a signed int for char and wchar, so any character
  // below '0' would pass the test as a negative number.
  if ('0' <= c && c <= '9')
    c -= '0';
  else
  {
    const lower = c | 0x20; // 'A'|0x20 == 'a'
    if ('a' <= lower && lower <= 'f')
      c = cast(Char)(lower - 'a' + 10);
    else
      return false;
  }
  return true;
}
369 
/// Generator for the ptable array literal. Only compiled with
/// -version=gen_ptable: recomputes every table entry and prints the
/// table as a formatted array literal to Stdout.
version(gen_ptable)
static this()
{
  alias p = ptable;
  assert(p.length == 256);
  // Initialize character properties table.
  foreach (i, ref props; p)
  {
    props = 0; // Reset
    const isDecimal = '0' <= i && i <= '9';
    const isHexLetter = 'a' <= i && i <= 'f' || 'A' <= i && i <= 'F';
    const isLetter = 'a' <= i && i <= 'z' || 'A' <= i && i <= 'Z';
    if ('0' <= i && i <= '7')
      props |= CP.Octal;
    if (isDecimal)
      props |= CP.Digit | CP.Hex;
    if (isHexLetter)
      props |= CP.Hex;
    if (isLetter)
      props |= CP.Alpha;
    if (i == '_')
      props |= CP.Underscore;
    if (i == ' ' || i == '\t' || i == '\v' || i == '\f')
      props |= CP.Whitespace;
  }
  // Store escape sequence values in second byte.
  assert(CProperty.max <= ubyte.max,
    "character property flags and escape value byte overlap.");
  // Each entry gets the value of the character that the escape sequence
  // "\<char>" stands for.
  p['\''] |= '\'' << 8;
  p['"'] |= '"' << 8;
  p['?'] |= '?' << 8;
  p['\\'] |= '\\' << 8;
  p['a'] |= '\a' << 8;
  p['b'] |= '\b' << 8;
  p['f'] |= '\f' << 8;
  p['n'] |= '\n' << 8;
  p['r'] |= '\r' << 8;
  p['t'] |= '\t' << 8;
  p['v'] |= '\v' << 8;
  // Print a formatted array literal.
  char[] array = "[\n".dup;
  foreach (i, c; ptable)
  {
    // Entries with an escape value are printed in hex.
    auto fmt = c > 255 ? " 0x{0:x}," : "{0,2},";
    array ~= Format(fmt, c);
    if ((i+1) % 16 == 0) // 16 entries per line.
      array ~= "\n";
  }
  array[$-2..$] = "\n]"; // Replace the trailing ",\n".
  Stdout(array).newline;
}