1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity average)
4 module dil.lexer.TokenSerializer;
5 
6 import dil.lexer.Identifier,
7        dil.lexer.IdTable,
8        dil.lexer.Funcs,
9        dil.lexer.Token;
10 import dil.Location,
11        dil.String;
12 import dil.Float : Float;
13 import common;
14 
15 /// Serializes a linked list of tokens to a buffer.
16 /// $(BNF
17 //// FileFormat := Header IdArray Tokens
18 ////     Header := "DIL1.0TOKS\x1A\x04\n"
19 ////    IdArray := "Ids:" IdCount IdElement* "\n"
20 ////  IdElement := AbsOffset IdLength
21 ////     Tokens := "Toks:" TokenCount BodyLength (IdTok | OtherTok)+ "\n"
22 ////      IdTok := TOK RelOffset IdIndex
23 ////   OtherTok := TOK RelOffset TokenLength
24 ////    IdCount := 2B # Number of elements in IdArray (=Identifier*[].)
25 ////  AbsOffset := 4B # Absolute offset from the beginning of the source text.
26 ////   IdLength := 2B # The length of the identifier.
27 //// TokenCount := 4B # Number of tokens (including EOF.)
28 //// BodyLength := 4B # Total length of the token data.
29 ////        TOK := 1B # The token kind.
30 ////  RelOffset := 2B # Relative offset to previous token (=whitespace.)
31 ////    IdIndex := 2B # Index into IdArray.
32 ////TokenLength := 2B # Length of the token's text.
33 ////         1B := 1Byte
34 ////         2B := 2Bytes
35 ////         4B := 4Bytes
36 ////)
struct TokenSerializer
{
static:
  /// Magic file header identifying the format and version.
  immutable string HEADER = "DIL1.0TOKS\x1A\x04\n";

  /// Serializes a list of tokens to a binary buffer (format: see $(BNF) above.)
  /// Params:
  ///   tokens = the tokens to serialize; should end with an EOF token so
  ///            that deserialize() will accept the result.
  /// Returns: the serialized bytes, or null if tokens is empty.
  ubyte[] serialize(Token[] tokens)
  {
    if (!tokens.length)
      return null; // Nothing to serialize; avoid a range error on tokens[0].

    ubyte[] data; // TODO: Can be preallocated considering nr of tokens.

    // Nested helpers that append raw (native-endian) data to the buffer.
    void writeS(cstring x)
    {
      data ~= cast(const(ubyte)[])x;
    }
    void write1B(ubyte x)
    {
      data ~= x;
    }
    void write2B(ushort x)
    {
      data ~= (cast(ubyte*)&x)[0..2];
    }
    void write4B(uint x)
    {
      data ~= (cast(ubyte*)&x)[0..4];
    }

    Token*[] idents; // List of all unique ids in this file.
    size_t[cstring] idtable; // Table of ids seen so far.
                             // Maps the id text to an index into idents.
                             // (Keyed by the string itself, not its hash,
                             // so hash collisions can't conflate two ids.)
    auto first = tokens[0];
    auto text_begin = first.ws ? first.ws : first.start;
    auto prev_end = text_begin;

    foreach (ref t; tokens)
    {
      switch (t.kind)
      {
      case TOK.Identifier:
        // Format: <1B:TOK><2B:OffsToStart><2B:IdentsIndex>
        auto id_str = t.ident.str;
        auto pindex = id_str in idtable;
        size_t id_index;
        if (!pindex)
          (id_index = idents.length),
          (idents ~= &t),
          (idtable[id_str] = id_index);
        else
          id_index = *pindex;
        // Write the bytes.
        write1B(cast(ubyte)t.kind); // TOK
        write2B(cast(ushort)(t.start - prev_end)); // OffsetToStart
        write2B(cast(ushort)id_index); // IdentsIndex
        break;
      // case TOK.Newline:
      //   break;
      // case TOK.String:
      //   break;
      // case TOK.HashLine:
      //   break;
      default:
        // Format: <1B:TOK><2B:OffsetToStart><2B:TokenLength>
        write1B(cast(ubyte)t.kind); // TOK
        write2B(cast(ushort)(t.start - prev_end)); // OffsetToStart
        write2B(cast(ushort)(t.end - t.start)); // TokenLength
      }
      prev_end = t.end;
    }

    // The token records were written first; now prepend header + id array.
    ubyte[] data_body = data;
    data = null;
    // Write file header.
    writeS(HEADER);
    writeS("Ids:");
    write2B(cast(ushort)idents.length);
    foreach (id; idents)
      write4B(cast(uint)(id.start - text_begin)),
      write2B(cast(ushort)(id.end - id.start));
    writeS("\n");
    writeS("Toks:");
    write4B(cast(uint)tokens.length); // Cast needed on 64-bit: size_t !-> uint.
    write4B(cast(uint)data_body.length);
    data ~= data_body;
    writeS("\n");
    return data;
  }

  /// Deserializes a buffer previously produced by serialize().
  /// Params:
  ///   data     = the serialized bytes.
  ///   srcText  = the original source text the tokens refer to; token
  ///              pointers are reconstructed relative to srcText.ptr.
  ///   idtable  = used to look up/intern identifier strings.
  ///   callback = called once per reconstructed token; returning false
  ///              aborts deserialization.
  /// Returns: the token array (first 3 slots reserved, last is EOF),
  ///          or null if the data is malformed or the callback aborted.
  Token[] deserialize(ubyte[] data, cstring srcText, IdTable idtable,
    bool delegate(Token* next_token) callback)
  {
    ubyte* p = data.ptr;
    ubyte* end = data.ptr + data.length;

    // Define nested functions for reading data and advancing the pointer.
    // Each returns true on success, leaving p past the consumed bytes.
    bool match(string x)
    {
      return p+x.length <= end &&
        p[0..x.length] == cast(immutable(ubyte)[])x &&
        ((p += x.length), 1);
    }
    bool read(out char[] x, uint len)
    {
      return p+len <= end && ((x = (cast(char*)p)[0..len]), (p += len), 1);
    }
    bool read2B(out uint x)
    {
      return p+1 < end && ((x = *cast(ushort*)p), (p += 2), 1);
    }
    bool read4B(out uint x)
    {
      return p+3 < end && ((x = *cast(uint*)p), (p += 4), 1);
    }
    // Reads one IdElement (<4B:AbsOffset><2B:IdLength>), validates it
    // against srcText and interns the identifier. Null on any error.
    Identifier* readID()
    {
      uint id_begin = void, id_len = void;
      if (!read4B(id_begin) || !read2B(id_len) ||
          id_begin+id_len > srcText.length) return null;
      auto id_str = srcText[id_begin .. id_begin + id_len];
      if (!IdTable.isIdentifierString(id_str)) return null;
      return idtable.lookup(id_str);
    }

    if (srcText.length == 0) return null;

    Token[] tokens;
    Identifier*[] idents;

    if (!match(HEADER)) return null;

    if (!match("Ids:")) return null;

    uint id_count = void;
    if (!read2B(id_count)) return null;
    idents = new Identifier*[id_count];

    for (uint i; i < id_count; i++)
      if (auto id = readID())
        idents[i] = id;
      else
        return null;

    if (!match("\nToks:")) return null;

    uint token_count = void;
    if (!read4B(token_count)) return null;

    uint body_length = void;
    if (!read4B(body_length)) return null;
    if (p + body_length + 1 != end) return null;
    if (*(p + body_length) != '\n') return null; // Terminated with '\n'.

    // We can allocate the exact amount of tokens we need.
    tokens = new Token[token_count+4]; // +4: see Lexer.scanAll().
    Token* token = &tokens[3]; // First 3 are reserved.
    auto prev_end = srcText.ptr;
    auto src_end = srcText.ptr+srcText.length;

    // Main loop that reads and initializes the tokens.
    while (p < end && token_count)
    {
      token.kind = cast(TOK)*p++;
      if (token.kind >= TOK.MAX) return null;

      uint offs_start = void;
      if (!read2B(offs_start)) return null;
      if (offs_start)
        token.ws = prev_end; // Preceding whitespace starts at prev token's end.
      token.start = prev_end + offs_start;
      if (token.start >= src_end) return null;

      uint token_len = void;
      switch (token.kind)
      {
      case TOK.Identifier:
        uint index = void;
        // Reject both a truncated read and an out-of-range IdIndex.
        // (Was `!read2B(index) && index < idents.length`, which skipped
        // the bounds check whenever the read succeeded.)
        if (!read2B(index) || index >= idents.length) return null;
        token.ident = idents[index];
        token_len = cast(uint)token.ident.str.length;
        break;
      default:
        if (!read2B(token_len)) return null;
      }
      // Set token.end.
      token.end = prev_end = token.start + token_len;
      if (prev_end > src_end) return null;
      // Pass the token back to the client.
      if (!callback(token))
        return null;
      // Advance the pointer to the next token in the array.
      token++;
      token_count--;
    }
    assert(token == tokens.ptr + tokens.length - 1, "token pointer not at end");
    token--; // Go back to the last token.

    if (token.kind != TOK.EOF) // Last token must be EOF.
      return null;

    return tokens;
  }
}