dil.Highlighter source code

1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity average)
4 module dil.Highlighter;
5 
6 import dil.ast.DefaultVisitor,
7        dil.ast.Node,
8        dil.ast.Declaration,
9        dil.ast.Statement,
10        dil.ast.Expression,
11        dil.ast.Types;
12 import dil.lexer.Lexer,
13        dil.lexer.Funcs;
14 import dil.parser.Parser;
15 import dil.semantic.Module;
16 import dil.Compilation;
17 import dil.SourceText;
18 import dil.String,
19        dil.Array;
20 import util.Path;
21 import common;
22 
/// A token and syntax highlighter.
///
/// All output is appended to an internal buffer; the text of every token
/// is wrapped in the format string that the tag map provides for it.
class Highlighter
{
  TagMap tags; /// Which tag map to use.
  CharArray buffer; /// Buffer that receives the text.
  CompilationContext cc; /// The compilation context.

  /// Constructs a Highlighter object.
  /// Params:
  ///   tags = The format strings used for tokens and nodes.
  ///   cc = The compilation context (provides lexer tables and diagnostics).
  this(TagMap tags, CompilationContext cc)
  {
    this.tags = tags;
    this.cc = cc;
  }

  /// Empties the buffer and returns its contents.
  char[] takeText()
  {
    return buffer.take();
  }

  /// Writes arguments formatted to the buffer.
  void printf(cstring format, ...)
  {
    buffer ~= Format(_arguments, _argptr, format);
  }

  /// Writes s to the buffer.
  void print(cstring s)
  {
    buffer ~= s;
  }

  /// Writes c to the buffer.
  void print(char c)
  {
    buffer ~= c;
  }

  /// Highlights tokens in a string.
  /// Params:
  ///   text = The source text to be scanned.
  ///   filePath = The path handed to the SourceText object.
  ///   lines = Receives the lexer's line number after scanning.
  /// Returns: A string with the highlighted tokens.
  cstring highlightTokens(cstring text, cstring filePath, out uint lines)
  {
    auto src = new SourceText(filePath, text);
    auto lx = new Lexer(src, cc.tables.lxtables, cc.diag);
    lx.scanAll();
    lines = lx.lineNum;
    highlightTokens(lx.tokenList);
    return takeText();
  }

  /// Highlights all given tokens and appends the result to the buffer.
  /// Params:
  ///   tokens = The tokens to be printed.
  ///   skipWS = Skips whitespace tokens (e.g. comments) if true.
  void highlightTokens(Token[] tokens, bool skipWS = false)
  {
    // Iterate by reference: printToken() takes a pointer, so the address of
    // the array element is passed instead of the address of a copy.
    foreach (ref token; tokens)
    {
      if (skipWS && token.isWhitespace)
        continue;
      token.ws && print(token.wsChars); // Print preceding whitespace.
      printToken(&token);
    }
  }

  /// ditto
  /// Params:
  ///   buffer = Receives the output instead of this object's own buffer.
  void highlightTokens(ref CharArray buffer, Token[] tokens,
    bool skipWS = false)
  {
    auto buffer_saved = this.buffer;
    this.buffer = buffer;
    highlightTokens(tokens, skipWS);
    buffer = this.buffer; // Update callers instance.
    this.buffer = buffer_saved;
  }

  /// Highlights all tokens of a source file.
  /// Params:
  ///   filePath = The path to the source file.
  ///   opt_printLines = Prints a block with line numbers if true.
  void highlightTokens(cstring filePath, bool opt_printLines)
  {
    auto src = new SourceText(filePath, true);
    auto lx = new Lexer(src, cc.tables.lxtables, cc.diag);
    lx.scanAll();

    // The file name is arbitrary text; escape it like all other text.
    printf(tags["DocHead"], xml_escape(Path(filePath).name()));
    if (lx.errors.length)
    {
      print(tags["CompBegin"]);
      printErrors(lx);
      print(tags["CompEnd"]);
    }

    if (opt_printLines)
    {
      print(tags["LineNumberBegin"]);
      printLines(lx.lineNum);
      print(tags["LineNumberEnd"]);
    }

    print(tags["SourceBegin"]);
    // Traverse the token array and print the tokens.
    foreach (ref token; lx.tokenList) {
      token.ws && print(token.wsChars); // Print preceding whitespace.
      printToken(&token);
    }
    print(tags["SourceEnd"]);
    print(tags["DocEnd"]);
  }

  /// Highlights the syntax in a source file.
  /// Params:
  ///   filePath = The path to the module, which is parsed first.
  ///   printHTML = Selects how closing node tags are printed (see overload).
  ///   opt_printLines = Prints a block with line numbers if true.
  void highlightSyntax(cstring filePath, bool printHTML, bool opt_printLines)
  {
    auto modul = new Module(filePath, cc);
    modul.parse();
    highlightSyntax(modul, printHTML, opt_printLines);
  }

  /// ditto
  /// Params:
  ///   modul = An already parsed module.
  ///   printHTML = If true, the "NodeEnd" tag is printed verbatim;
  ///     otherwise it is used as a format string for the node's tag.
  void highlightSyntax(Module modul, bool printHTML, bool opt_printLines)
  {
    auto parser = modul.parser;
    auto lx = parser.lexer;
    auto tokens = lx.tokenList;
    auto tokenExList = new TokenExBuilder().build(modul.root, tokens);

    printf(tags["DocHead"], modul.getFQN());
    if (lx.errors.length || parser.errors.length)
    { // Output error messages.
      print(tags["CompBegin"]);
      printErrors(lx);
      printErrors(parser);
      print(tags["CompEnd"]);
    }

    if (opt_printLines)
    {
      print(tags["LineNumberBegin"]);
      printLines(lx.lineNum);
      print(tags["LineNumberEnd"]);
    }

    print(tags["SourceBegin"]);

    auto tagNodeBegin = tags["NodeBegin"];
    auto tagNodeEnd = tags["NodeEnd"];

    // Iterate over list of tokens.
    foreach (i, ref tokenEx; tokenExList)
    {
      auto token = &tokens[i];
      token.ws && print(token.wsChars); // Print preceding whitespace.
      if (token.isWhitespace) {
        printToken(token);
        continue;
      }
      // <node>
      foreach (node; tokenEx.beginNodes)
        printf(tagNodeBegin, tags.getTag(node.kind),
          node.getShortClassName());
      // Token text.
      printToken(token);
      // </node>
      if (printHTML)
        foreach_reverse (node; tokenEx.endNodes)
          print(tagNodeEnd);
      else
        foreach_reverse (node; tokenEx.endNodes)
          printf(tagNodeEnd, tags.getTag(node.kind));
    }
    print(tags["SourceEnd"]);
    print(tags["DocEnd"]);
  }

  /// Prints the errors of the lexer using the "LexerError" format string.
  void printErrors(Lexer lx)
  {
    // The path is escaped as well: it may contain '<', '>' or '&'.
    foreach (e; lx.errors)
      printf(tags["LexerError"], xml_escape(e.filePath),
                   e.loc, e.col, xml_escape(e.getMsg));
  }

  /// Prints the errors of the parser using the "ParserError" format string.
  void printErrors(Parser parser)
  {
    // The path is escaped as well: it may contain '<', '>' or '&'.
    foreach (e; parser.errors)
      printf(tags["ParserError"], xml_escape(e.filePath),
                   e.loc, e.col, xml_escape(e.getMsg));
  }

  /// Prints the numbers 1 to lines (inclusive) using the "LineNumber" format.
  void printLines(uint lines)
  {
    auto lineNumberFormat = tags["LineNumber"];
    for (auto lineNum = 1; lineNum <= lines; lineNum++)
      printf(lineNumberFormat, lineNum);
  }

  /// Prints a token to the buffer, using the format string of its kind.
  void printToken(Token* token)
  {
    switch (token.kind)
    {
    case TOK.Identifier:
      printf(tags.Identifier, token.text);
      break;
    case TOK.Comment:
      cstring formatStr;
      switch (token.start[1]) // The 2nd character determines the style.
      {
      case '/': formatStr = tags.LineC; break;
      case '*': formatStr = tags.BlockC; break;
      case '+': formatStr = tags.NestedC; break;
      default: assert(0);
      }
      printf(formatStr, xml_escape(token.text));
      break;
    case TOK.String:
      cstring text = token.text;
      assert(text.length);
      if (text.length > 1 && text[0] == 'q' && text[1] == '{')
      {
      version(D2)
      {
        auto buffer_saved = this.buffer; // Save;
        this.buffer = CharArray(text.length);
        print("q{");
        // Traverse and print inner tokens.
        Token* last; // Remember last token.
        for (auto t = token.strval.tokens; t.kind; t++)
        {
          t.ws && print(t.wsChars); // Print preceding whitespace.
          printToken(t);
          last = t;
        }
        if (last) // Print: Whitespace? "}" Postfix?
          print(slice(last.end, token.end));
        else // No inner tokens, e.g. "q{}": print everything after "q{".
          print(slice(token.start + 2, token.end));
        text = takeText();
        this.buffer = buffer_saved; // Restore
      }
      }
      else
        text = (text[0] == '"') ?
          scanEscapeSequences(text, tags.Escape) :
          xml_escape(text);
      printf(tags.String, text);
      break;
    case TOK.Character:
      cstring text = token.text;
      text = (text.length > 1 && text[1] == '\\') ?
        scanEscapeSequences(text, tags.Escape) :
        xml_escape(text);
      printf(tags.Char, text);
      break;
    case TOK.Int32, TOK.Int64, TOK.UInt32, TOK.UInt64,
         TOK.Float32, TOK.Float64, TOK.Float80,
         TOK.IFloat32, TOK.IFloat64, TOK.IFloat80:
      printf(tags.Number, token.text);
      break;
    case TOK.Shebang:
      printf(tags.Shebang, xml_escape(token.text));
      break;
    case TOK.HashLine:
      // The text to be inserted into formatStr.
      char[] lineText;

      void printWS(cchar* start, cchar* end)
      {
        if (start != end) lineText ~= start[0 .. end - start];
      }

      auto num = token.hlval.lineNum;
      if (num is null) // Malformed #line
        // The text is arbitrary here and must be escaped.
        lineText = xml_escape(token.text).dup;
      else
      {
        // Print whitespace between #line and number.
        printWS(token.start, num.start); // Prints "#line" as well.
        lineText ~= Format(tags.Number, num.text); // Print the number.

        if (auto filespec = token.hlval.filespec)
        { // Print whitespace between number and filespec.
          printWS(num.end, filespec.start);
          lineText ~= Format(tags.Filespec, xml_escape(filespec.text));
        }
      }
      // Finally print the whole token.
      printf(tags.HLine, lineText);
      break;
    case TOK.Illegal:
      printf(tags.Illegal, token.text);
      break;
    case TOK.Newline:
      printf(tags.Newline, token.text);
      break;
    case TOK.EOF:
      print(tags.EOF);
      break;
    default:
      if (token.isKeyword())
        printf(tags.Keyword, token.text);
      else if (token.isSpecialToken)
        printf(tags.SpecialToken, token.text);
      else
        print(tags[token.kind]);
    }
  }

  /// Highlights escape sequences inside a text. Also escapes XML characters.
  /// Params:
  ///   text = The text to search in.
  ///   fmt  = The format string passed to the function Format().
  /// Returns: The processed text, or the original if nothing was replaced.
  static cstring scanEscapeSequences(cstring text, cstring fmt)
  {
    auto p = text.ptr, end = p + text.length;
    auto prev = p; // Remembers the end of the previous escape sequence.
    CharArray result;
    cstring escape_str;

    while (p < end)
    {
      string xml_entity = void;
      switch (*p)
      {
      case '\\': break; // Found beginning of an escape sequence.
      // Code to escape XML chars:
      case '<': xml_entity = "&lt;";  goto Lxml;
      case '>': xml_entity = "&gt;";  goto Lxml;
      case '&': xml_entity = "&amp;"; goto Lxml;
      Lxml:
        if (prev < p) result ~= slice(prev, p); // Append previous string.
        result ~= xml_entity; // Append entity.
        prev = ++p;
        continue; // End of "XML" code.
      default:
        p++;
        continue; // Nothing to escape. Continue.
      }

      auto escape_str_begin = p;
      assert(*p == '\\');
      p++;
      if (p >= end)
        break;

      uint digits = void;
      switch (*p)
      {
      case 'x':
        digits = 2+1;
      case_Unicode:
        assert(digits == 2+1 || digits == 4+1 || digits == 8+1);
        // ">=" is intended: at least one character (the closing quote)
        // is expected to follow the digits.
        if (p+digits >= end)
          p++; // Broken sequence. Only skip the letter.
        else // +1 was added everywhere else, so that the digits are skipped.
          p += digits;
        break;
      case 'u': digits = 4+1; goto case_Unicode;
      case 'U': digits = 8+1; goto case_Unicode;
      default:
        if (char2ev(*p)) // Table lookup.
          p++;
        else if (isoctal(*p))
        { // Up to three octal digits.
          if (++p < end && isoctal(*p))
            if (++p < end && isoctal(*p))
              p++;
        }
        else if (*p == '&')
        { // Skip to ";". Assume valid sequence.
          auto entity_name_begin = p+1;
          while (++p < end && isalnum(*p))
          {}
          if (p < end && *p == ';')
            p++; // Skip ';'.
          escape_str = "\\&amp;" ~ slice(entity_name_begin, p);
          goto Lescape_str_assigned;
        }
        // else
          // continue; // Broken escape sequence.
      }

      escape_str = slice(escape_str_begin, p);
    Lescape_str_assigned:
      if (prev < escape_str_begin) // Append previous string.
        result ~= slice(prev, escape_str_begin);
      result ~= Format(fmt, escape_str); // Finally format the escape sequence.
      prev = p; // Update prev pointer.
    }
    assert(p <= end && prev <= end);

    if (prev is text.ptr)
      return text; // Nothing escaped. Return original, unchanged text.
    if (prev < end)
      result ~= slice(prev, end);
    return result[];
  }
}
417 
/// Replaces the characters '<', '>' and '&' with "&lt;", "&gt;" and "&amp;".
/// Returns: The escaped text, or the original if there was nothing to replace.
cstring xml_escape(cstring text)
{
  auto cursor = text.ptr, stop = cursor + text.length;
  auto pending = cursor; // Start of the run that hasn't been copied yet.
  CharArray escaped;

  for (; cursor < stop; cursor++)
  {
    string entity; // The replacement for the current character.
    switch (*cursor)
    {
    case '<': entity = "&lt;";  break;
    case '>': entity = "&gt;";  break;
    case '&': entity = "&amp;"; break;
    default:
      continue; // An ordinary character. Go to the next one.
    }
    if (!escaped.ptr) // Reserve memory on the first replacement.
      escaped.cap = text.length;
    if (pending != cursor) // Copy the unescaped run before this character.
      escaped ~= slice(pending, cursor);
    escaped ~= entity;
    pending = cursor + 1; // The next run starts after the special character.
  }

  if (pending is text.ptr)
    return text; // Nothing was replaced. Return the original text.
  if (pending < stop) // Copy the remaining run.
    escaped ~= slice(pending, stop);
  return escaped[];
}
449 
/// Maps names and token kinds to (format) strings.
class TagMap
{
  cstring[hash_t] table;
  cstring[TOK.MAX] tokenTable;

  /// Constructs a TagMap from a table whose keys are hashes of names.
  /// Initializes the shortcut members and the token table.
  this(cstring[hash_t] table)
  {
    this.table = table;

    // Looks up a format string, which defaults to the plain text "{0}".
    cstring fmt(cstring name)
    {
      return this[name, "{0}"];
    }

    Identifier   = fmt("Identifier");
    String       = fmt("String");
    Char         = fmt("Char");
    Number       = fmt("Number");
    Keyword      = fmt("Keyword");
    LineC        = fmt("LineC");
    BlockC       = fmt("BlockC");
    NestedC      = fmt("NestedC");
    Escape       = fmt("Escape");
    Shebang      = fmt("Shebang");
    HLine        = fmt("HLine");
    Filespec     = fmt("Filespec");
    Illegal      = fmt("Illegal");
    Newline      = fmt("Newline");
    SpecialToken = fmt("SpecialToken");
    // The node categories and EOF have their own defaults.
    Declaration  = this["Declaration", "d"];
    Statement    = this["Statement", "s"];
    Expression   = this["Expression", "e"];
    Type         = this["Type", "t"];
    Other        = this["Other", "o"];
    EOF          = this["EOF", ""];

    // Copy the entries that are named after a token string.
    foreach (kind, name; tokToString)
    {
      auto entry = hashOf(name) in this.table;
      if (entry)
        tokenTable[kind] = *entry;
    }
  }

  /// Returns the value for str, or 'fallback' if str is not in the table.
  cstring opIndex(cstring str, cstring fallback = "")
  {
    auto entry = hashOf(str) in table;
    return entry ? *entry : fallback;
  }

  /// Returns the value for tok in O(1) time.
  cstring opIndex(TOK tok)
  {
    return tokenTable[tok];
  }

  /// Assigns str to tokenTable[tok].
  void opIndexAssign(cstring str, TOK tok)
  {
    tokenTable[tok] = str;
  }

  /// Shortcuts for quick access.
  cstring Identifier, String, Char, Number, Keyword, LineC, BlockC, Escape,
         NestedC, Shebang, HLine, Filespec, Illegal, Newline, SpecialToken,
         Declaration, Statement, Expression, Type, Other, EOF;

  /// Returns the tag for the category 'k', or null if k is in no category.
  cstring getTag(NodeKind k)
  {
    if (k.isDeclaration)
      return Declaration;
    if (k.isStatement)
      return Statement;
    if (k.isExpression)
      return Expression;
    if (k.isType)
      return Type;
    if (k.isParameter)
      return Other;
    return null;
  }
}
528 
/// Returns the class name of a Node without the module path and
/// without the common suffix.$(BR)
/// E.g.: dil.ast.Declarations.ClassDecl -> Class
///
/// The names are computed once per NodeKind and stored in a static table.
string getShortClassName(Node node)
{
  static string[] cache;
  if (cache is null) // Allocate one slot for every kind of node.
    cache = new string[NodeKind.max+1];

  auto slot = &cache[node.kind];
  if (slot.ptr)
    return *slot; // Already computed.

  // Take the part after the last dot of the fully qualified name.
  auto className = IString(typeid(node).name).rpartition('.')[1];
  // Decl, Stmt, Expr, Type have length 4. Parameters have no suffix.
  size_t suffixLength = node.isParameter ? 0 : 4;
  // Cut off the suffix and remember the result.
  *slot = className[0..Neg(suffixLength)][];
  return *slot;
}
548 
/// Extended token structure.
///
/// Stores the nodes that begin and the nodes that end at a token.
/// TokenExBuilder keeps one instance for every token of its token array,
/// at the same index as the token.
struct TokenEx
{
  //Token* token; /// The lexer token.
  Node[] beginNodes; /// beginNodes[n].begin == token
  Node[] endNodes; /// endNodes[n].end == token
}
556 
/// Builds an array of TokenEx items by visiting the nodes of a tree.
class TokenExBuilder : DefaultVisitor
{
  TokenEx[] tokenExs; /// Extended tokens.
  Token[] tokens; /// Original tokens.

  /// Visits the tree starting at root and records the visited nodes.
  /// Returns: An array with exactly one TokenEx per token.
  TokenEx[] build(Node root, Token[] tokens)
  {
    this.tokens = tokens;
    this.tokenExs = new TokenEx[tokens.length];
    super.visitN(root);
    return this.tokenExs;
  }

  /// Returns the TokenEx that has the same index as t has in the tokens.
  TokenEx* getTokenEx(Token* t)
  {
    auto first = tokens.ptr;
    assert(first <= t && t < first+tokens.length);
    auto index = t - first;
    return &tokenExs[index];
  }

  /// Override dispatch function.
  /// Registers n at its first and at its last token before continuing.
  override Node dispatch(Node n)
  {
    assert(n && n.begin && n.end);
    auto beginEx = getTokenEx(n.begin);
    beginEx.beginNodes ~= n;
    auto endEx = getTokenEx(n.end);
    endEx.endNodes ~= n;
    return super.dispatch(n);
  }
}