/// Author: Aziz Köksal
/// License: GPL3
/// $(Maturity very high)
module dil.Converter;

import dil.lexer.Funcs;
import dil.i18n.Messages;
import dil.Diagnostics,
       dil.Location,
       dil.Unicode,
       dil.FileBOM;
import common;

/// Converts various Unicode encoding formats to UTF-8.
///
/// Problems found in the input are appended to diag as LexerError objects,
/// each located at filePath and the line number counted so far.
struct Converter
{
  cstring filePath; /// For error messages.
  Diagnostics diag; /// Receives the errors found during conversion.

  static
  {
    /// Byte-swaps c (reverses the order of its four bytes).
    dchar swapBytes(dchar c)
    {
      return c = (c << 24) |
                 (c >> 24) |
                 ((c >> 8) & 0xFF00) |
                 ((c << 8) & 0xFF0000);
    }

    /// Byte-swaps c (exchanges its two bytes).
    wchar swapBytes(wchar c)
    {
      // The shift operates on the promoted value; the cast truncates the
      // result back to 16 bits.
      return cast(wchar)((c << 8) | (c >> 8));
    }

    /// Swaps the bytes of c on a little-endian machine.
    dchar BEtoMachineDword(dchar c)
    {
      version(LittleEndian)
        return swapBytes(c);
      else
        return c;
    }

    /// Swaps the bytes of c on a big-endian machine.
    dchar LEtoMachineDword(dchar c)
    {
      version(LittleEndian)
        return c;
      else
        return swapBytes(c);
    }

    /// Swaps the bytes of c on a little-endian machine.
    wchar BEtoMachineWord(wchar c)
    {
      version(LittleEndian)
        return swapBytes(c);
      else
        return c;
    }

    /// Swaps the bytes of c on a big-endian machine.
    wchar LEtoMachineWord(wchar c)
    {
      version(LittleEndian)
        return c;
      else
        return swapBytes(c);
    }
  }

  /// Converts a UTF-32 text to UTF-8.
  ///
  /// Every code unit rejected by isValidChar() is reported to diag and
  /// replaced with REPLACEMENT_CHAR. If the length of data is not a multiple
  /// of 4, the trailing bytes are ignored and an error is reported.
  /// Params:
  ///   isBigEndian = Whether the code units in data are stored big-endian.
  ///   data = The raw bytes of the text, without a BOM.
  /// Returns: The UTF-8 text, or null if data is empty.
  char[] UTF32toUTF8(bool isBigEndian)(const(ubyte)[] data)
  {
    if (data.length == 0)
      return null;

    char[] result;
    uint lineNum = 1;
    // Used to clear first 2 bits to make len multiple of 4.
    const bmask = ~cast(size_t)0b11;
    auto text = cast(const(dchar)[]) data[0 .. $ & bmask];

    foreach (dchar c; text)
    {
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        diag ~= new LexerError(
          new Location(filePath, lineNum),
          diag.formatMsg(MID.InvalidUTF32Character, c)
        );
        c = REPLACEMENT_CHAR;
      }

      // Keep track of the line number for the error locations.
      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 4)
      diag ~= new LexerError(
        new Location(filePath, lineNum),
        diag.formatMsg(MID.UTF32FileMustBeDivisibleBy4)
      );

    return result;
  }

  alias UTF32BEtoUTF8 = UTF32toUTF8!(true); /// Instantiation for UTF-32 BE.
  alias UTF32LEtoUTF8 = UTF32toUTF8!(false); /// Instantiation for UTF-32 LE.

  /// Converts a UTF-16 text to UTF-8.
  ///
  /// Surrogate pairs are combined into a single code point. Every unpaired
  /// surrogate (a low surrogate without a preceding high surrogate, or
  /// a high surrogate that is not followed by a low surrogate) is reported
  /// to diag and replaced with REPLACEMENT_CHAR. If the length of data is odd,
  /// the trailing byte is ignored and an error is reported.
  /// Params:
  ///   isBigEndian = Whether the code units in data are stored big-endian.
  ///   data = The raw bytes of the text, without a BOM.
  /// Returns: The UTF-8 text, or null if data is empty.
  char[] UTF16toUTF8(bool isBigEndian)(const(ubyte)[] data)
  {
    if (data.length == 0)
      return null;

    // Used to clear first bit to make len multiple of 2.
    const bmask = ~cast(size_t)0b1;
    auto text = cast(const(wchar)[]) data[0 .. $ & bmask];
    auto p = text.ptr;
    auto end = p + text.length;
    char[] result;
    uint lineNum = 1;

    for (; p < end; p++)
    {
      dchar c = *p;
      static if (isBigEndian)
        c = BEtoMachineWord(cast(wchar)c);
      else
        c = LEtoMachineWord(cast(wchar)c);

      if (0xD800 <= c && c <= 0xDFFF)
      { // c is a surrogate: only valid as the first half of a pair.
        bool isPair = false;

        if (c <= 0xDBFF && p+1 < end)
        { // High surrogate with a following code unit: try to decode a pair.
          wchar c2 = p[1];
          static if (isBigEndian)
            c2 = BEtoMachineWord(c2);
          else
            c2 = LEtoMachineWord(c2);

          if (0xDC00 <= c2 && c2 <= 0xDFFF)
          {
            // 0xD7C0 == 0xD800 - (0x10000 >> 10): strips the surrogate tag
            // and adds the 0x10000 offset in a single subtraction.
            c = (c - 0xD7C0) << 10;
            c |= (c2 & 0x3FF);
            ++p; // Consume the low surrogate as well.
            isPair = true;
          }
        }

        if (!isPair)
        { // Lone low surrogate, or high surrogate without a low surrogate.
          // The following code unit (if any) is not consumed; it is
          // processed by the next iteration.
          diag ~= new LexerError(
            new Location(filePath, lineNum),
            diag.formatMsg(MID.InvalidUTF16Character, c)
          );
          c = REPLACEMENT_CHAR;
        }
      }

      // Keep track of the line number for the error locations.
      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    if (data.length % 2)
      diag ~= new LexerError(
        new Location(filePath, lineNum),
        diag.formatMsg(MID.UTF16FileMustBeDivisibleBy2)
      );
    return result;
  }

  alias UTF16BEtoUTF8 = UTF16toUTF8!(true); /// Instantiation for UTF-16 BE.
  alias UTF16LEtoUTF8 = UTF16toUTF8!(false); /// Instantiation for UTF-16 LE.

  /// Converts the text in data to UTF-8.
  /// Leaves data unchanged if it is in UTF-8 already.
  ///
  /// The encoding is taken from the BOM reported by tellBOM(). Without a BOM
  /// it is guessed from the position of the zero bytes in the first
  /// character, which is expected to be an ASCII character.
  /// Params:
  ///   data = The raw bytes of the text, including the BOM if present.
  /// Returns: The UTF-8 text (a slice of data if no conversion was needed),
  ///   or null if data is empty.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] text;

    final switch (tellBOM(data))
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3 bytes of the BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]);
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]);
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]);
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]);
      break;
    }
    return text;
  }
}

/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space,)
/// and Newlines with '\n'.
///
/// The text is rewritten in place: a writer pointer trails a reader pointer
/// through the same buffer. An invalid sequence that is shorter than
/// the 3 bytes of the replacement character is dropped when the writer has
/// not fallen far enough behind the reader to make room for it.
/// Params:
///   text = The string to be sanitized; no new memory is allocated.
/// Returns: The sanitized slice of text, or null if text is empty.
char[] sanitizeText(char[] text)
{
  if (!text.length)
    return null;

  auto q = text.ptr; // Writer.
  cchar* p = q; // Reader.
  auto end = p + text.length;

  while (p < end)
  {
    assert(q <= p);

    if (isascii(*p))
    {
      if (scanNewline(p, end))
        *q++ = '\n'; // Copy newlines as '\n'.
      else
        *q++ = *p++; // Copy the ASCII character and advance pointers.
      continue;
    }

    auto p2 = p; // Remember beginning of the UTF-8 sequence.
    dchar c = decode(p, end);

    if (c == ERROR_CHAR)
    { // Skip to next ASCII character or valid UTF-8 sequence.
      while (++p < end && !isValidLead(*p))
      {}
      if (q+2 < p) // Copy replacement char if there is enough space.
      {
        q[0..3] = REPLACEMENT_STR;
        // Advance the writer past the replacement character; otherwise
        // the next copied code units would overwrite it.
        q += 3;
      }
    }
    else // Copy the valid UTF-8 sequence.
      while (p2 < p) // p points to one past the last trail byte.
        *q++ = *p2++; // Copy code units.
  }
  assert(p == end);
  text.length = q - text.ptr;
  return text;
}

/// Tests that Converter.data2UTF8() yields "source" for the same text
/// in every supported encoding, with and without a BOM.
void testConverter()
{
  scope msg = new UnittestMsg("Testing struct Converter.");

  struct Data2Text
  {
    cstring text; /// The encoded input.
    cstring expected = "source"; /// The expected UTF-8 output.
    /// Returns a mutable copy of text as raw bytes.
    @property ubyte[] data()
    { return cast(ubyte[])text.dup; }
  }

  static Data2Text[] map = [
    // Without BOM
    {"source"},
    {"s\0o\0u\0r\0c\0e\0"},
    {"\0s\0o\0u\0r\0c\0e"},
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    // With BOM
    {"\xEF\xBB\xBFsource"},
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
  ];

  auto converter = Converter("", new Diagnostics());
  foreach (i, pair; map)
    assert(converter.data2UTF8(pair.data) == pair.expected,
      Format("failed at item {}", i));
}