/// Author: Aziz Köksal
/// License: GPL3
/// $(Maturity average)
module dil.lexer.TokenSerializer;

import dil.lexer.Identifier,
       dil.lexer.IdTable,
       dil.lexer.Funcs,
       dil.lexer.Token;
import dil.Location,
       dil.String;
import dil.Float : Float;
import common;

/// Serializes a linked list of tokens to a buffer.
/// $(BNF
//// FileFormat := Header IdArray Tokens
//// Header := "DIL1.0TOKS\x1A\x04\n"
//// IdArray := "Ids:" IdCount IdElement* "\n"
//// IdElement := AbsOffset IdLength
//// Tokens := "Toks:" TokenCount BodyLength (IdTok | OtherTok)+ "\n"
//// IdTok := TOK RelOffset IdIndex
//// OtherTok := TOK RelOffset TokenLength
//// IdCount := 2B # Number of elements in IdArray (=Identifier*[].)
//// AbsOffset := 4B # Absolute offset from the beginning of the source text.
//// IdLength := 2B # The length of the identifier.
//// TokenCount := 4B # Number of tokens (including EOF.)
//// BodyLength := 4B # Total length of the token data.
//// TOK := 1B # The token kind.
//// RelOffset := 2B # Relative offset to the previous token (=whitespace.)
//// IdIndex := 2B # Index into IdArray.
//// TokenLength := 2B # Length of the token's text.
//// 1B := 1 byte
//// 2B := 2 bytes
//// 4B := 4 bytes
////)
struct TokenSerializer
{
static:
  immutable string HEADER = "DIL1.0TOKS\x1A\x04\n";

  ubyte[] serialize(Token[] tokens)
  {
    ubyte[] data; // TODO: could be preallocated using the number of tokens.

    void writeS(cstring x)
    {
      data ~= cast(const(ubyte)[])x;
    }
    void write1B(ubyte x)
    {
      data ~= x;
    }
    // Note: multi-byte fields are written in host byte order (little-endian
    // on x86), so the format is not portable between endiannesses.
    void write2B(ushort x)
    {
      data ~= (cast(ubyte*)&x)[0..2];
    }
    void write4B(uint x)
    {
      data ~= (cast(ubyte*)&x)[0..4];
    }

    Token*[] idents; // List of all unique ids in this file.
    size_t[hash_t] idtable; // Table of ids seen so far.
                            // Maps the id hash to an index into idents.
    auto first = tokens[0];
    auto text_begin = first.ws ? first.ws : first.start;
    auto prev_end = text_begin;

    foreach (ref t; tokens)
    {
      switch (t.kind)
      {
      case TOK.Identifier:
        // Format: <1B:TOK><2B:OffsToStart><2B:IdentsIndex>
        auto hash = hashOf(t.ident.str);
        auto pindex = hash in idtable;
        size_t id_index;
        if (!pindex)
          (id_index = idents.length),
          (idents ~= &t),
          (idtable[hash] = id_index);
        else
          id_index = *pindex;
        // Write the bytes.
        write1B(cast(ubyte)t.kind); // TOK
        write2B(cast(ushort)(t.start - prev_end)); // OffsetToStart
        write2B(cast(ushort)id_index); // IdentsIndex
        break;
      // case TOK.Newline:
      //   break;
      // case TOK.String:
      //   break;
      // case TOK.HashLine:
      //   break;
      default:
        // Format: <1B:TOK><2B:OffsetToStart><2B:TokenLength>
        write1B(cast(ubyte)t.kind); // TOK
        write2B(cast(ushort)(t.start - prev_end)); // OffsetToStart
        write2B(cast(ushort)(t.end - t.start)); // TokenLength
      }
      prev_end = t.end;
    }

    ubyte[] data_body = data;
    data = null;
    // Write the file header.
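    // Illustration (values made up): a file with 2 unique identifiers and
    // 5 tokens is laid out as
    //   "DIL1.0TOKS\x1A\x04\n"                        // Header
    //   "Ids:" <2B:2> <4B><2B> <4B><2B> "\n"          // IdArray
    //   "Toks:" <4B:5> <4B:BodyLength> <records> "\n" // Tokens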
    writeS(HEADER);
    writeS("Ids:");
    write2B(cast(ushort)idents.length);
    foreach (id; idents)
      write4B(cast(uint)(id.start - text_begin)),
      write2B(cast(ushort)(id.end - id.start));
    writeS("\n");
    writeS("Toks:");
    write4B(cast(uint)tokens.length);
    write4B(cast(uint)data_body.length);
    data ~= data_body;
    writeS("\n");
    return data;
  }

  Token[] deserialize(ubyte[] data, cstring srcText, IdTable idtable,
    bool delegate(Token* next_token) callback)
  {
    ubyte* p = data.ptr;
    ubyte* end = data.ptr + data.length;

    // Define nested functions for reading data and advancing the pointer.
    bool match(string x)
    {
      return p+x.length <= end &&
             p[0..x.length] == cast(immutable(ubyte)[])x &&
             ((p += x.length), 1);
    }
    bool read(out char[] x, uint len)
    {
      return p+len <= end && ((x = (cast(char*)p)[0..len]), (p += len), 1);
    }
    bool read2B(out uint x)
    {
      return p+1 < end && ((x = *cast(ushort*)p), (p += 2), 1);
    }
    bool read4B(out uint x)
    {
      return p+3 < end && ((x = *cast(uint*)p), (p += 4), 1);
    }
    Identifier* readID()
    {
      uint id_begin = void, id_len = void;
      if (!read4B(id_begin) || !read2B(id_len) ||
          id_begin+id_len > srcText.length) return null;
      auto id_str = srcText[id_begin .. id_begin + id_len];
      if (!IdTable.isIdentifierString(id_str)) return null;
      return idtable.lookup(id_str);
    }

    if (srcText.length == 0) return null;

    Token[] tokens;
    Identifier*[] idents;

    if (!match(HEADER)) return null;

    if (!match("Ids:")) return null;

    uint id_count = void;
    if (!read2B(id_count)) return null;
    idents = new Identifier*[id_count];

    for (uint i; i < id_count; i++)
      if (auto id = readID())
        idents[i] = id;
      else
        return null;

    if (!match("\nToks:")) return null;

    uint token_count = void;
    if (!read4B(token_count)) return null;

    uint body_length = void;
    if (!read4B(body_length)) return null;
    if (p + body_length + 1 != end) return null;
    if (*(p + body_length) != '\n') return null; // Terminated with '\n'.

    // We can allocate the exact amount of tokens we need.
    tokens = new Token[token_count+4]; // +4: see Lexer.scanAll().
    Token* token = &tokens[3]; // The first 3 tokens are reserved.
    auto prev_end = srcText.ptr;
    auto src_end = srcText.ptr+srcText.length;

    // Main loop that reads and initializes the tokens.
    while (p < end && token_count)
    {
      token.kind = cast(TOK)*p++;
      if (token.kind >= TOK.MAX) return null;

      uint offs_start = void;
      if (!read2B(offs_start)) return null;
      if (offs_start)
        token.ws = prev_end;
      token.start = prev_end + offs_start;
      if (token.start >= src_end) return null;

      uint token_len = void;
      switch (token.kind)
      {
      case TOK.Identifier:
        uint index = void;
        if (!read2B(index) || index >= idents.length) return null;
        token.ident = idents[index];
        token_len = cast(uint)token.ident.str.length;
        break;
      default:
        if (!read2B(token_len)) return null;
      }
      // Set token.end.
      token.end = prev_end = token.start + token_len;
      if (prev_end > src_end) return null;
      // Pass the token back to the client.
      if (!callback(token))
        return null;
      // Advance the pointer to the next token in the array.
      token++;
      token_count--;
    }

    if (token_count != 0) // The body ended before all records were read.
      return null;
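    // All records were consumed; token now points one past the last
    // deserialized token, at the reserved trailing slot (see the +4 above.)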
    assert(token == tokens.ptr + tokens.length - 1, "token pointer not at end");
    token--; // Go back to the last token.

    if (token.kind != TOK.EOF) // The last token must be EOF.
      return null;

    return tokens;
  }
}
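// A minimal usage sketch (not part of the original module; kept disabled
// with version(none)): round-trip a token array through serialize() and
// deserialize(). The variables tokens, srcText and idtable are assumed to
// come from a prior run of the lexer; the callback accepts every token.
version (none)
unittest
{
  Token[] tokens;  // E.g. the result of lexing srcText.
  cstring srcText; // The source text the tokens point into.
  IdTable idtable; // The identifier table used while lexing.

  ubyte[] data = TokenSerializer.serialize(tokens);

  size_t count;
  auto tokens2 = TokenSerializer.deserialize(data, srcText, idtable,
    (Token* t) { count++; return true; });

  assert(tokens2 !is null); // Null indicates malformed data.
  // 3 reserved slots precede the deserialized tokens; the last one is EOF.
  assert(tokens2[count + 2].kind == TOK.EOF);
}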