1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity very high)
4 module dil.Converter;
5 
6 import dil.lexer.Funcs;
7 import dil.i18n.Messages;
8 import dil.Diagnostics,
9        dil.Location,
10        dil.Unicode,
11        dil.FileBOM;
12 import common;
13 
/// Converts various Unicode encoding formats to UTF-8.
///
/// Input buffers are interpreted in the requested byte order; every
/// invalid code point is reported through diag and replaced with U+FFFD.
struct Converter
{
  cstring filePath; /// For error messages.
  Diagnostics diag; /// Receives a LexerError for each encoding problem.

static
{
  /// Byte-swaps c. (Reverses the order of all 4 bytes: ABCD -> DCBA.)
  /// NOTE(review): the assignment to c inside the return is redundant;
  /// only the expression's value is used.
  dchar swapBytes(dchar c)
  {
    return c = (c << 24) |
               (c >> 24) |
              ((c >> 8) & 0xFF00) |
              ((c << 8) & 0xFF0000);
  }

  /// Byte-swaps c. (Swaps the 2 bytes of a UTF-16 code unit.)
  wchar swapBytes(wchar c)
  {
    return cast(wchar)(c << 8) | (c >> 8);
  }

  /// Swaps the bytes of c on a little-endian machine.
  /// I.e. converts a big-endian dword to the machine's byte order.
  dchar BEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  /// I.e. converts a little-endian dword to the machine's byte order.
  dchar LEtoMachineDword(dchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }

  /// Swaps the bytes of c on a little-endian machine.
  /// I.e. converts a big-endian word to the machine's byte order.
  wchar BEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return swapBytes(c);
    else
      return c;
  }

  /// Swaps the bytes of c on a big-endian machine.
  /// I.e. converts a little-endian word to the machine's byte order.
  wchar LEtoMachineWord(wchar c)
  {
    version(LittleEndian)
      return c;
    else
      return swapBytes(c);
  }
}

  /// Converts a UTF-32 text to UTF-8.
  /// Params:
  ///   isBigEndian = Whether data is in big-endian byte order.
  ///   data = The raw text (without a BOM.)
  /// Returns: the text encoded as UTF-8. Invalid characters are reported
  ///   via diag and replaced with U+FFFD.
  char[] UTF32toUTF8(bool isBigEndian)(const(ubyte)[] data)
  {
    if (data.length == 0)
      return null;

    char[] result;
    uint lineNum = 1; // Tracks newlines so errors carry a line number.
    // Used to clear first 2 bits to make len multiple of 4.
    const bmask = ~cast(size_t)0b11;
    auto text = cast(const(dchar)[]) data[0 .. $ & bmask];

    foreach (dchar c; text)
    {
      // Bring the code unit into the machine's byte order.
      static if (isBigEndian)
        c = BEtoMachineDword(c);
      else
        c = LEtoMachineDword(c);

      if (!isValidChar(c))
      {
        diag ~= new LexerError(
          new Location(filePath, lineNum),
          diag.formatMsg(MID.InvalidUTF32Character, c)
        );
        c = REPLACEMENT_CHAR; // Substitute U+FFFD.
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    // Trailing bytes that don't form a whole dword are an error.
    if (data.length % 4)
      diag ~= new LexerError(
        new Location(filePath, lineNum),
        diag.formatMsg(MID.UTF32FileMustBeDivisibleBy4)
      );

    return result;
  }

  alias UTF32BEtoUTF8 = UTF32toUTF8!(true); /// Instantiation for UTF-32 BE.
  alias UTF32LEtoUTF8 = UTF32toUTF8!(false); /// Instantiation for UTF-32 LE.

  /// Converts a UTF-16 text to UTF-8.
  /// Params:
  ///   isBigEndian = Whether data is in big-endian byte order.
  ///   data = The raw text (without a BOM.)
  /// Returns: the text encoded as UTF-8. Invalid characters are reported
  ///   via diag and replaced with U+FFFD.
  char[] UTF16toUTF8(bool isBigEndian)(const(ubyte)[] data)
  {
    if (data.length == 0)
      return null;

    // Used to clear first bit to make len multiple of 2.
    const bmask = ~cast(size_t)0b1;
    auto text = cast(const(wchar)[]) data[0 .. $ & bmask];
    auto p = text.ptr;
    auto end = p + text.length;
    char[] result;
    uint lineNum = 1; // Tracks newlines so errors carry a line number.

    for (; p < end; p++)
    {
      dchar c = *p;
      // Bring the code unit into the machine's byte order.
      static if (isBigEndian)
        c = BEtoMachineWord(cast(wchar)c);
      else
        c = LEtoMachineWord(cast(wchar)c);

      if (0xD800 > c || c > 0xDFFF)
      {} // Not a surrogate; encode as-is below.
      else if (c <= 0xDBFF && p+1 < end)
      { // Decode surrogate pairs.
        wchar c2 = p[1];
        static if (isBigEndian)
          c2 = BEtoMachineWord(c2);
        else
          c2 = LEtoMachineWord(c2);

        if (0xDC00 <= c2 && c2 <= 0xDFFF)
        { // 0xD7C0 == 0xD800 - (0x10000 >> 10): subtracting it folds the
          // supplementary-plane offset into the shifted lead surrogate.
          c = (c - 0xD7C0) << 10;
          c |= (c2 & 0x3FF); // Low bits are zero, so OR == ADD here.
          ++p; // Consumed the trail surrogate too.
        }
        // NOTE(review): if c2 is not a trail surrogate, no error is
        // reported and the lone lead surrogate is encoded as-is —
        // confirm this fall-through is intentional.
      }
      else
      { // Lone trail surrogate, or lead surrogate at the end of the text.
        diag ~= new LexerError(
          new Location(filePath, lineNum),
          diag.formatMsg(MID.InvalidUTF16Character, c)
        );
        c = REPLACEMENT_CHAR; // Substitute U+FFFD.
      }

      if (isNewline(c))
        ++lineNum;
      dil.Unicode.encode(result, c);
    }

    // A trailing lone byte can't form a whole code unit; report it.
    if (data.length % 2)
      diag ~= new LexerError(
        new Location(filePath, lineNum),
        diag.formatMsg(MID.UTF16FileMustBeDivisibleBy2)
      );
    return result;
  }

  alias UTF16BEtoUTF8 = UTF16toUTF8!(true); /// Instantiation for UTF-16 BE.
  alias UTF16LEtoUTF8 = UTF16toUTF8!(false); /// Instantiation for UTF-16 LE.

  /// Converts the text in data to UTF-8.
  /// Leaves data unchanged if it is in UTF-8 already.
  /// The encoding is detected from the BOM, or, when there is none,
  /// from the zero-byte pattern of the leading (ASCII) character.
  char[] data2UTF8(ubyte[] data)
  {
    if (data.length == 0)
      return null;

    char[] text;

    final switch (tellBOM(data))
    {
    case BOM.None:
      // No BOM found. According to the specs the first character
      // must be an ASCII character.
      if (data.length >= 4)
      {
        if (data[0..3] == x"00 00 00")
        {
          text = UTF32BEtoUTF8(data); // UTF-32BE: 00 00 00 XX
          break;
        }
        else if (data[1..4] == x"00 00 00")
        {
          text = UTF32LEtoUTF8(data); // UTF-32LE: XX 00 00 00
          break;
        }
      }
      if (data.length >= 2)
      {
        if (data[0] == 0) // UTF-16BE: 00 XX
        {
          text = UTF16BEtoUTF8(data);
          break;
        }
        else if (data[1] == 0) // UTF-16LE: XX 00
        {
          text = UTF16LEtoUTF8(data);
          break;
        }
      }
      text = cast(char[])data; // UTF-8
      break;
    case BOM.UTF8:
      text = cast(char[])data[3..$]; // Skip the 3-byte BOM.
      break;
    case BOM.UTF16BE:
      text = UTF16BEtoUTF8(data[2..$]); // Skip the 2-byte BOM.
      break;
    case BOM.UTF16LE:
      text = UTF16LEtoUTF8(data[2..$]); // Skip the 2-byte BOM.
      break;
    case BOM.UTF32BE:
      text = UTF32BEtoUTF8(data[4..$]); // Skip the 4-byte BOM.
      break;
    case BOM.UTF32LE:
      text = UTF32LEtoUTF8(data[4..$]); // Skip the 4-byte BOM.
      break;
    }
    return text;
  }
}
244 
/// Replaces invalid UTF-8 sequences with U+FFFD (if there's enough space)
/// and Newlines with '\n'.
///
/// Works in place with a reader pointer p and a writer pointer q;
/// since the output is never longer than the input, q never passes p.
/// Params:
///   text = The string to be sanitized; no new memory is allocated.
/// Returns: text, shrunk to the sanitized length.
char[] sanitizeText(char[] text)
{
  if (!text.length)
    return null;

  auto q = text.ptr; // Writer.
  cchar* p = q; // Reader.
  auto end = p + text.length;

  while (p < end)
  {
    assert(q <= p);

    if (isascii(*p))
    {
      if (scanNewline(p, end))
        *q++ = '\n'; // Copy newlines as '\n'.
      else
        *q++ = *p++; // Copy the ASCII character and advance pointers.
      continue;
    }

    auto p2 = p; // Remember beginning of the UTF-8 sequence.
    dchar c = decode(p, end);

    if (c == ERROR_CHAR)
    { // Skip to next ASCII character or valid UTF-8 sequence.
      while (++p < end && !isValidLead(*p))
      {}
      if (q+2 < p) // Copy replacement char if there is enough space.
      {
        q[0..3] = REPLACEMENT_STR;
        // FIX: advance the writer past the replacement; without this the
        // U+FFFD bytes were overwritten by the next copied character.
        q += 3;
      }
    }
    else // Copy the valid UTF-8 sequence.
      while (p2 < p) // p points to one past the last trail byte.
        *q++ = *p2++; // Copy code units.
  }
  assert(p == end);
  text.length = q - text.ptr; // Shrink to the written length.
  return text;
}
289 
/// Round-trip check for Converter: every supported UTF encoding of the
/// word "source" (with and without a BOM) must convert back to UTF-8.
void testConverter()
{
  scope msg = new UnittestMsg("Testing struct Converter.");

  /// Pairs one encoded input with the UTF-8 text it must decode to.
  struct Sample
  {
    cstring raw; /// The encoded input.
    cstring want = "source"; /// Every sample decodes to this word.
    @property ubyte[] bytes()
    { return cast(ubyte[])raw.dup; }
  }

  static Sample[] samples = [
    // Without BOM
    {"source"},
    {"s\0o\0u\0r\0c\0e\0"},
    {"\0s\0o\0u\0r\0c\0e"},
    {"s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
    {"\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    // With BOM
    {"\xEF\xBB\xBFsource"},
    {"\xFE\xFF\0s\0o\0u\0r\0c\0e"},
    {"\xFF\xFEs\0o\0u\0r\0c\0e\0"},
    {"\x00\x00\xFE\xFF\0\0\0s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e"},
    {"\xFF\xFE\x00\x00s\0\0\0o\0\0\0u\0\0\0r\0\0\0c\0\0\0e\0\0\0"},
  ];

  auto conv = Converter("", new Diagnostics());
  foreach (idx, sample; samples)
    assert(conv.data2UTF8(sample.bytes) == sample.want,
      Format("failed at item {}", idx));
}