dil.lexer.Token source code

1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity high)
4 module dil.lexer.Token;
5 
6 import dil.lexer.Identifier,
7        dil.lexer.Funcs;
8 import dil.Location;
9 import dil.Float;
10 import dil.Array;
11 import common;
12 
13 public import dil.lexer.TokensEnum;
14 
15 /// A Token is a sequence of characters recognized by the lexical analyzer.
16 ///
17 /// Example:
18 /// $(PRE ‘    StringValue’
19 //// ^$(Token ws, ws) ^$(Token start, start)     ^$(Token end, end)
20 ///
21 ///$(Token kind, kind)  = TOK.Identifier
22 ///$(Token flags, flags) = Flags.None
23 ///$(Token union.ident, ident) = $(Identifier)("StringValue", kind))
24 /// Macros:
25 ///   Token = $(SYMLINK Token.$1, $2)
26 ///   Identifier = $(SYMLINK2 dil.lexer.Identifier, Identifier)
struct Token
{
  TOK kind;     /// The token kind.
  cchar* ws;    /// Points to the preceding whitespace characters if present.
  cchar* start; /// Points to the first character of the token.
  cchar* end;   /// Points one character past the end of the token.

  /// Represents the string value of a single string literal,
  /// where possible escape sequences have been converted to their values.
  struct StringValue
  {
    cbinstr str;    /// The typeless string value.
    char pf = 0;    /// Postfix: 'c', 'w', 'd'. '\0' for none.
    version(D2)
    Token* tokens; /// Points to the contents of a token string stored
                   /// as a zero-terminated array.
  }

  /// Represents the long/ulong value of a number literal.
  union IntegerValue
  {
    long  long_;  /// A long integer value.
    ulong ulong_; /// An unsigned long integer value.
  }

  /// Represents the data of a newline token.
  struct NewlineValue
  {
    size_t lineNum; /// The line number in the source text.
    HashLineInfo* hlinfo; /// Info from a "#line" token.
  }

  /// Represents the value of a "#line Number Filespec?" token.
  struct HashLineValue
  {
    Token* lineNum; /// The Number.
    Token* filespec; /// The optional Filespec.
  }

  /// Represents the info of a #line token. Used for error messages.
  struct HashLineInfo
  {
    size_t lineNum; /// Delta line number calculated from #line Number.
    cstring path;   /// File path set by #line num Filespec.

    /// Calculates and returns the line number.
    /// Params:
    ///   realnum = The real line number in the source text.
    /// Returns: realnum minus the stored delta.
    size_t getLineNum(size_t realnum)
    {
      return realnum - lineNum;
    }

    /// Calculates a delta value and sets 'lineNum'.
    /// Params:
    ///   realnum = The real line number in the source text.
    ///   hlnum = The number given by the #line token.
    void setLineNum(size_t realnum, size_t hlnum)
    {
      lineNum = realnum - hlnum + 1;
    }
  }

  /// Data associated with this token.
  union /+TokenValue+/
  {
    NewlineValue* nlval; /// Value of a newline token.
    HashLineValue* hlval; /// Value of a #line token.
    StringValue* strval; /// The value of a string token.
    Identifier* ident; /// For keywords and identifiers.
    dchar  dchar_; /// Value of a character literal.
    size_t sizet_; /// An integer that fits into the address space.
    int    int_; /// Value of an Int32 token.
    uint   uint_; /// Value of a UInt32 token.
    version(X86_64)
    IntegerValue intval; /// Value of a number literal.
    else
    IntegerValue* intval; /// Value of a number literal.
    Float mpfloat; /// A multiple precision float value.
    void* pvoid; /// Associate arbitrary data with this token.
  }
//   static assert(TokenValue.sizeof == (void*).sizeof);

  /// Returns the text of the token, i.e. the slice [start, end).
  cstring text()
  {
    assert(start <= end);
    return start[0 .. end - start];
  }

  /// Sets the text of the token by pointing start and end at s.
  void text(cstring s)
  {
    start = s.ptr;
    end = s.ptr + s.length;
  }

  /// Returns the preceding whitespace of the token, i.e. the slice [ws, start).
  cstring wsChars()
  {
    assert(ws && start);
    return ws[0 .. start - ws];
  }

  /// Returns the next token.
  /// The token is addressed relative to this one (&this + 1).
  Token* next()
  {
    assert(kind != TOK.Invalid);
    return &this + 1;
  }

  /// Returns the previous token.
  /// The token is addressed relative to this one (&this - 1).
  Token* prev()
  {
    assert(kind != TOK.Invalid);
    return &this - 1;
  }

  /// Finds the next non-whitespace token. Does not go past TOK.EOF.
  Token* nextNWS()
  {
    assert(kind != TOK.Invalid);
    auto token = &this;
    if (kind != TOK.EOF)
      while ((++token).isWhitespace)
      {}
    return token;
  }

  /// Finds the previous non-whitespace token. Does not go past TOK.HEAD.
  Token* prevNWS()
  {
    assert(kind != TOK.Invalid);
    auto token = &this;
    if (kind != TOK.HEAD)
      while ((--token).isWhitespace)
      {}
    return token;
  }

  /// Returns the text of this token.
  cstring toString()
  {
    return text();
  }

  /// Returns true if this is a token that can have newlines in it.
  ///
  /// These can be block and nested comments and any string literal
  /// except for escape string literals.
  bool isMultiline()
  {
    // Escape strings start with a backslash; line comments start with "//".
    return (kind == TOK.String && start[0] != '\\') ||
           (kind == TOK.Comment && start[1] != '/');
  }

  /// Returns true if this is a keyword token.
  bool isKeyword()
  {
    return KeywordsBegin <= kind && kind <= KeywordsEnd;
  }

  /// Returns true if this is an integral type token.
  bool isIntegralType()
  {
    return IntegralTypeBegin <= kind && kind <= IntegralTypeEnd;
  }

  /// Returns true if this is a whitespace token.
  bool isWhitespace()
  { // Tokens from TOK.init to TOK.LastWhitespace are whitespace.
    return kind <= TOK.LastWhitespace;
  }

  /// Returns true if this is a special token.
  bool isSpecialToken()
  {
    return kind == TOK.SpecialID;
  }

version(D2)
{
  /// Returns true if this is a token string literal.
  bool isTokenStringLiteral()
  { // strval.tok_str !is null
    return kind == TOK.String && *start == 'q' && start[1] == '{';
  }
}

  /// Returns true if this token starts a DeclarationDefinition.
  bool isDeclDefStart()
  {
    return isDeclDefStartToken(kind);
  }

  /// Returns true if this token starts a Statement.
  bool isStatementStart()
  {
    return isStatementStartToken(kind);
  }

  /// Returns true if this token starts an AsmStatement.
  bool isAsmStatementStart()
  {
    return isAsmStatementStartToken(kind);
  }

  /// Compares a token's kind to kind2.
  /// Returns: true if both kinds are equal.
  bool opEquals(TOK kind2)
  {
    return kind == kind2;
  }

  /// Compares the position of two tokens.
  /// Assumes they are from the same source text.
  int opCmp(Token* rhs)
  { // Returns: (lower, equal, greater) = (-1, 0, 1)
    return start < rhs.start ? -1 : start !is rhs.start;
  }

  /// Returns the Location of this token.
  ///
  /// Walks back to the preceding newline token to get the line number and
  /// then rescans every multiline token between that newline and this token,
  /// because those may contain further newlines.
  /// Params:
  ///   realLocation = If false, the path and line number of a preceding
  ///                  "#line" token (if any) override the real ones.
  ///   filePath = The path of the source file.
  Location getLocation(bool realLocation)(cstring filePath)
  {
    auto search_t = &this;
    // Find previous newline token.
    while ((--search_t).kind != TOK.Newline)
    {}
    auto newline = search_t.nlval;
    auto lineNum = newline.lineNum;
    static if (!realLocation)
      if (auto hlinfo = newline.hlinfo)
      { // Change file path and line number.
        filePath = hlinfo.path;
        lineNum  = hlinfo.getLineNum(newline.lineNum);
      }
    auto lineBegin = search_t.end;
    // Determine actual line begin and line number.
    while (++search_t < &this)
      // Multiline tokens must be rescanned for newlines.
      if (search_t.isMultiline)
        for (auto p = search_t.start, end_ = search_t.end; p < end_;)
          // Use the bounded scanner (as lineCount does), so that the scan
          // never looks past the end of the token.
          if (scanNewline(p, end_))
          {
            ++lineNum;
            lineBegin = p;
          }
          else
            ++p;
    return new Location(filePath, lineNum, lineBegin, this.start);
  }

  alias getRealLocation = getLocation!(true);
  alias getErrorLocation = getLocation!(false);

  /// Returns the location of the character past the end of this token.
  ///
  /// For tokens that span several lines both the line begin and the line
  /// number are advanced to the last line of the token.
  /// Params:
  ///   filePath = The path of the source file.
  Location errorLocationOfEnd(cstring filePath)
  {
    auto loc = getErrorLocation(filePath);
    loc.to = end;
    if (isMultiline) // Multiline tokens may have newlines.
      for (auto p = start, end_ = end; p < end_;)
        if (scanNewline(p, end_))
        { // The end of the token lies on a later line: keep the line number
          // in sync with the line begin.
          // NOTE(review): assumes Location exposes a writable 'lineNum'
          // field, matching its constructor argument - confirm.
          ++loc.lineNum;
          loc.lineBegin = p;
        }
        else
          ++p;
    return loc;
  }

  /// Counts the newlines in this token.
  /// Returns: 0 for tokens that are not multiline.
  uint lineCount()
  {
    uint count;
    if (this.isMultiline)
      for (auto p = start, end_ = end; p < end_;)
        if (scanNewline(p, end_))
          count++;
        else
          p++;
    return count;
  }

  /// Return the source text enclosed by the left and right token.
  /// Both tokens must point into the same text, left not after right.
  static cstring textSpan(Token* left, Token* right)
  {
    assert(left.end <= right.start || left is right );
    return left.start[0 .. right.end - left.start];
  }

  /// ditto
  cstring textSpan(Token* right)
  {
    return textSpan(&this, right);
  }
}
312 
/// A DArray of Token values.
alias TokenArray = DArray!(Token);
314 
/// Tells whether a token of the given kind can begin a DeclarationDefinition.
/// Params:
///   tok = The token kind to be classified.
/// Returns: true for the listed kinds and for every integral type kind.
bool isDeclDefStartToken(TOK tok)
{
  alias T = TOK;
  bool startsDeclDef;
  switch (tok)
  {
  case  T.Align, T.Pragma, T.Export, T.Private, T.Package, T.Protected,
        T.Public, T.Extern, T.Deprecated, T.Override, T.Abstract,
        T.Synchronized, T.Static, T.Final, T.Const,
        T.Auto, T.Scope, T.Alias, T.Typedef, T.Import, T.Enum, T.Class,
        T.Interface, T.Struct, T.Union, T.This, T.Tilde, T.Unittest, T.Debug,
        T.Version, T.Template, T.New, T.Delete, T.Mixin, T.Semicolon,
        T.Identifier, T.Dot, T.Typeof:
    startsDeclDef = true;
    break;
  version(D2)
  {
  case T.Immutable, T.Pure, T.Shared, T.Gshared,
       T.Ref, T.Nothrow, T.At:
    startsDeclDef = true;
    break;
  }
  default:
    // Anything else qualifies only if it lies in the integral type range.
    startsDeclDef = IntegralTypeBegin <= tok && tok <= IntegralTypeEnd;
  }
  return startsDeclDef;
}
341 
/// Tells whether a token of the given kind can begin a Statement.
/// Params:
///   tok = The token kind to be classified.
/// Returns: true for the listed kinds, for TOK.SpecialID and for every
///   integral type kind.
bool isStatementStartToken(TOK tok)
{
  alias T = TOK;
  // Handle the range check and the special identifier up front.
  if (tok == T.SpecialID ||
      (IntegralTypeBegin <= tok && tok <= IntegralTypeEnd))
    return true;

  switch (tok)
  {
  case  T.Align, T.Extern, T.Final, T.Const, T.Auto, T.Identifier, T.Dot,
        T.Typeof, T.If, T.While, T.Do, T.For, T.Foreach, T.ForeachReverse,
        T.Switch, T.Case, T.Default, T.Continue, T.Break, T.Return, T.Goto,
        T.With, T.Synchronized, T.Try, T.Throw, T.Scope, T.Volatile, T.Asm,
        T.Pragma, T.Mixin, T.Static, T.Debug, T.Version, T.Alias, T.Semicolon,
        T.Enum, T.Class, T.Interface, T.Struct, T.Union, T.LBrace, T.Typedef,
        T.This, T.Super, T.Null, T.True, T.False, T.Int32, T.Int64, T.UInt32,
        T.UInt64, T.Float32, T.Float64, T.Float80, T.IFloat32,
        T.IFloat64, T.IFloat80, T.Character, T.String, T.LBracket,
        T.Function, T.Delegate, T.Assert, T.Import, T.Typeid, T.Is, T.LParen,
        T.Amp, T.Plus2, T.Minus2, T.Star,
        T.Minus, T.Plus, T.Exclaim, T.Tilde, T.New, T.Delete, T.Cast:
    return true;
  version(D2)
  {
  case T.Traits, T.Immutable, T.Pure, T.Shared, T.Gshared,
       T.Ref, T.Nothrow, T.At:
    return true;
  }
  default:
    break;
  }
  return false;
}
374 
/// Tells whether a token of the given kind can begin an AsmStatement.
/// Params:
///   tok = The token kind to be classified.
bool isAsmStatementStartToken(TOK tok)
{
  alias T = TOK;
  // TODO: need to add all opcodes.
  return tok == T.In || tok == T.Int || tok == T.Out ||
         tok == T.Identifier || tok == T.Align || tok == T.Semicolon;
}
388 
/// One pre-built token per token kind, indexed by the kind's value.
/// The module constructor sets each entry's text to kind.toString().
Token[TOK.MAX] staticTokens;
391 
/// Looks up the pre-built token of a token kind.
/// Params:
///   kind = The kind used as the index into staticTokens.
/// Returns: a pointer to the matching entry of staticTokens.
Token* toToken(TOK kind)
{
  auto entry = &staticTokens[kind];
  return entry;
}
397 
/// Fills staticTokens: every entry gets its kind and the text returned by
/// kind.toString(); keyword and identifier entries also get their Identifier.
static this()
{
  import dil.lexer.IDs;

  for (size_t i = 0; i < staticTokens.length; i++)
  {
    auto tok = &staticTokens[i];
    tok.kind = cast(TOK)i;
    // The setter points start/end at the boundaries of the string.
    tok.text(tok.kind.toString());
  }

  // Set the ident member of the keyword tokens and the one Identifier token.
  foreach (ref kw; IDs.getKeywordIDs())
    staticTokens[kw.kind].ident = &kw;
  staticTokens[TOK.Identifier].ident = &IDs.Identifier_;
  staticTokens[TOK.SpecialID].ident = &IDs.SpecialID;
}