1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity very high)
4 module dil.Unicode;
5 
6 import common;
7 
8 public import util.uni : isUniAlpha;
9 
/// U+FFFD = �. Used to replace invalid Unicode characters.
/// convertString() substitutes it for every sequence that fails to decode.
enum cdchar REPLACEMENT_CHAR = '\uFFFD';
enum cchar[3] REPLACEMENT_STR = "\uFFFD"; /// Ditto, as a 3-element cchar array.
/// Invalid character, returned on errors.
/// 0xD800 lies in the surrogate range, so isValidChar() rejects it and it
/// can never be confused with a successfully decoded character.
enum dchar ERROR_CHAR = 0xD800;
15 
/// Checks whether d is a Unicode scalar value.
/// Returns: false for surrogate code points (0xD800..0xDFFF) and for
/// values above 0x10FFFF; true for everything else.
bool isValidChar(dchar d)
{
  if (0xD800 <= d && d <= 0xDFFF)
    return false; // Surrogates are never valid on their own.
  return d <= 0x10FFFF;
}
22 
/// There are a total of 66 noncharacters:
/// 32 in the block U+FDD0..U+FDEF and the last two code points
/// (xxFFFE and xxFFFF) of each of the 17 planes (34).
/// Returns: true if d is one of them.
/// See_also: Chapter 16.7 Noncharacters in Unicode 5.0
bool isNoncharacter(dchar d)
{
  if (0xFDD0 <= d && d <= 0xFDEF)
    return true; // The contiguous block of 32.
  if (d > 0x10FFFF)
    return false; // Outside the Unicode code space.
  return (d & 0xFFFF) >= 0xFFFE; // Last two code points of a plane.
}
31 
/// Returns: true if b is a trail (continuation) byte of a UTF-8 sequence,
/// i.e. its two most significant bits are 10.
bool isTrailByte(ubyte b)
{
  return (b >> 6) == 0b10; // 10xx_xxxx
}
37 
/// Returns: true if b is a lead byte of a multi-byte UTF-8 sequence,
/// i.e. its two most significant bits are 11.
bool isLeadByte(ubyte b)
{
  return b >= 0xC0; // 11xx_xxxx
}
43 
/// Returns: true if c can start a UTF-8 sequence, that is, if c is an
/// ASCII character or a multi-byte lead byte other than 0xC0 and 0xC1
/// (which could only start an overlong 2-byte sequence).
///
/// ASCII must be accepted here: code that resynchronises after a decoding
/// error skips forward until this function returns true, and would
/// otherwise silently drop every ASCII character following the error.
bool isValidLead(char c)
{ // NB: not all overlong sequences are checked.
  if (c < 0x80)
    return true; // ASCII is a complete sequence on its own.
  return (c & 0xC0) == 0xC0 && (c & 0xFE) != 0xC0;
}
50 
/// Returns: true if c can start a UTF-16 sequence, i.e. c is anything
/// but a low surrogate (0xDC00..0xDFFF).
bool isValidLead(wchar c)
{
  const isLowSurrogate = 0xDC00 <= c && c <= 0xDFFF;
  return !isLowSurrogate;
}
56 
/// Returns: true if c can start a UTF-32 sequence. Every UTF-32 code unit
/// is a whole character, so this is the same test as isValidChar().
bool isValidLead(dchar c)
{
  const valid = isValidChar(c);
  return valid;
}
62 
/// Enumeration of errors related to decoding UTF-8 sequences.
/// utf8Error() returns one of these to classify a sequence that
/// decode() rejected.
enum UTF8Error
{
  Invalid,    /// The correctly decoded character is invalid.
  Overlong,   /// Overlong sequence (must be encoded with fewer bytes.)
  TrailByte,  /// Missing trail byte.
  Over4Bytes, /// The sequence is longer than 4 bytes.
}
/// Enumeration of errors related to decoding UTF-16 sequences.
/// utf16Error() returns one of these to classify a sequence that
/// decode() rejected.
enum UTF16Error
{
  Invalid,     /// The correctly decoded character is invalid.
  LoSurrogate, /// Missing low surrogate wchar.
  HiSurrogate, /// Missing high surrogate wchar.
}
78 
/// Returns the precise error in the UTF-8 sequence starting at s[i].
/// Params:
///   s = The string containing the faulty sequence.
///   i = Index of the first byte of the sequence. On return it holds the
///       position at which the pointer-based overload stopped.
UTF8Error utf8Error(cstring s, ref size_t i)
{
  auto start = s.ptr;
  auto cursor = start + i;
  const kind = utf8Error(cursor, start + s.length);
  i = cursor - start; // Translate the pointer back into an index.
  return kind;
}
/// Returns the precise error in the UTF-8 sequence starting at p.
/// Must only be called for a sequence that decode() rejects.
/// Params:
///   p = First byte of the sequence (a non-ASCII byte). It is advanced
///       over the bytes that were examined.
///   end = One past the last byte of the string.
UTF8Error utf8Error(ref cchar* p, cchar* end)
in { auto p_ = p; assert(decode(p_, end) == ERROR_CHAR); }
body
{
  dchar lead = *p;
  assert(lead >= 0x80);

  // The second byte must exist and be a trail byte.
  ++p;
  if (p >= end || !isTrailByte(*p))
    return UTF8Error.TrailByte;

  // Overlong forms: the lead byte carries no payload bits and the
  // second byte leaves the bits unused that would justify the length.
  const twoByteOverlong = (lead & 0xFE) == 0xC0; // 1100000x
  if (lead.In(0xE0, 0xF0, 0xF8, 0xFC) && (lead & *p) == 0x80 ||
      twoByteOverlong)
    return UTF8Error.Overlong;

  if ((lead & 0b1110_0000) == 0b1100_0000)
    return UTF8Error.Invalid; // 2 bytes, both present.

  if ((lead & 0b1111_0000) == 0b1110_0000)
  { // 3 bytes: one more trail byte is required.
    bool complete = p + 1 < end;
    if (complete)
      complete = isTrailByte(*++p);
    return complete ? UTF8Error.Invalid : UTF8Error.TrailByte;
  }

  if ((lead & 0b1111_1000) == 0b1111_0000)
  { // 4 bytes: two more trail bytes are required.
    bool complete = p + 2 < end;
    if (complete)
      complete = isTrailByte(*++p);
    if (complete)
      complete = isTrailByte(*++p);
    return complete ? UTF8Error.Invalid : UTF8Error.TrailByte;
  }

  // Any other lead byte pattern.
  return UTF8Error.Over4Bytes;
}
/// Returns the precise error in the UTF-16 sequence starting at s[i].
/// Params:
///   s = The string containing the faulty sequence.
///   i = Index of the first code unit of the sequence. On return it holds
///       the position at which the pointer-based overload stopped.
UTF16Error utf16Error(cwstring s, ref size_t i)
{
  auto start = s.ptr;
  auto cursor = start + i;
  const kind = utf16Error(cursor, start + s.length);
  i = cursor - start; // Translate the pointer back into an index.
  return kind;
}
/// Returns the precise error in the UTF-16 sequence starting at p.
/// Must only be called for a sequence that decode() rejects.
/// Params:
///   p = First code unit of the sequence. Advanced past the code units
///       that belong to the faulty sequence (one, or two for a pair).
///   end = One past the last code unit of the string.
UTF16Error utf16Error(ref cwchar* p, cwchar* end)
in { auto p_ = p; assert(decode(p_, end) == ERROR_CHAR); }
body
{
  UTF16Error kind;
  const dchar first = *p;
  if (first > 0xDBFF)
    kind = UTF16Error.HiSurrogate; // Not a high surrogate.
  else if (p + 1 < end && 0xDC00 <= p[1] && p[1] <= 0xDFFF)
  { // A complete surrogate pair; skip its second half as well.
    kind = UTF16Error.Invalid;
    ++p;
  }
  else
    kind = UTF16Error.LoSurrogate; // No low surrogate follows.
  ++p;
  return kind;
}
138 
139 // NB: All functions below advance the pointer/index only
140 //     when the decoded Unicode sequence was valid.
141 
/// Decodes a non-ASCII Unicode alpha character at ref_p.
/// ref_p is advanced only if the character is a valid Unicode alpha.
/// Params:
///   ref_p = Set to the last trail byte of the valid UTF-8 sequence.
///   end = One past the last byte of the string.
/// Returns: The valid alpha character or 0 (also for any ASCII byte).
dchar decodeUnicodeAlpha(ref cchar* ref_p, cchar* end)
in { assert(ref_p && ref_p < end); }
out(c) { assert(c == 0 || isUniAlpha(c)); }
body
{
  if (*ref_p < 0x80)
    return 0; // ASCII is not handled here.

  auto cursor = ref_p;
  dchar decoded = decode(cursor, end);
  if (!isUniAlpha(decoded))
    return 0;

  // decode() stops one past the sequence; step back onto its last byte.
  ref_p = cursor - 1;
  return decoded;
}
163 
/// Returns true when p points to a valid Unicode alpha character.
/// In that case p is advanced as described for decodeUnicodeAlpha().
bool scanUnicodeAlpha(ref cchar* p, cchar* end)
{
  return decodeUnicodeAlpha(p, end) != 0;
}
170 
/// Returns true when p points to a valid Unicode alpha character.
/// The caller's pointer is left untouched because p is passed by value.
bool isUnicodeAlpha(cchar* p, cchar* end)
{
  return decodeUnicodeAlpha(p, end) != 0;
}
176 
/// Decodes a character from str at index.
/// Params:
///   str = The UTF-8 string.
///   index = Set to one past the ASCII char or one past the last trail byte
///           of the valid UTF-8 sequence. Unchanged on error.
/// Returns: The decoded character, or ERROR_CHAR for a faulty sequence.
dchar decode(cstring str, ref size_t index)
in { assert(str.length && index < str.length); }
out { assert(index <= str.length); }
body
{
  auto start = str.ptr;
  auto cursor = start + index;
  auto limit = start + str.length;
  dchar decoded = decode(cursor, limit);
  if (decoded == ERROR_CHAR)
    return decoded; // Leave index where it was.
  index = cursor - start;
  return decoded;
}
193 
/// Decodes a character starting at ref_p.
///
/// Rejected are: truncated sequences, missing trail bytes, the overlong
/// forms detected below, 5 and 6 byte sequences, stray trail bytes used
/// as lead bytes, and sequences that decode to a value for which
/// isValidChar() is false.
/// Params:
///   ref_p = Set to one past the ASCII char or one past the last trail byte
///           of the valid UTF-8 sequence. Unchanged on error.
///   end = One past the last byte of the string.
/// Returns: The decoded character, or ERROR_CHAR for a faulty sequence.
dchar decode(ref cchar* ref_p, cchar* end)
in { assert(ref_p && ref_p < end); }
out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  auto p = ref_p;
  dchar c = *p;
  char c2 = void;

  if (c < 0x80)
    goto Lreturn; // ASCII character.

  // Error if: end of string or second byte is not a trail byte.
  if (!(++p < end && isTrailByte(c2 = *p)))
    goto Lerror;

  // Check for overlong sequences.
  // 0xE0: c=11100000 c2=100xxxxx
  // 0xF0: c=11110000 c2=1000xxxx
  // 0xF8: c=11111000 c2=10000xxx
  // 0xFC: c=11111100 c2=100000xx
  if (c.In(0xE0, 0xF0, 0xF8, 0xFC) && (c & c2) == 0x80 ||
      (c & 0xFE) == 0xC0) // 1100000x
    goto Lerror;

  // Code snippets mixed in at the labels below. The two halves are joined
  // with an explicit '~': implicit concatenation of adjacent string
  // literals is a deprecated language feature.
  enum checkNextByte = "if (!isTrailByte(c2 = *++p))" ~
                       "  goto Lerror;";
  enum appendSixBits = "c = (c << 6) | c2 & 0b0011_1111;";

  // See how many bytes need to be decoded.
  assert(p == ref_p+1, "p doesn't point to the second byte");
  if ((c & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    c &= 0b0001_1111;
    goto L2Bytes;
  }
  else if ((c & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_1111;
    if (p + 1 < end) // The third byte must lie inside the string.
      goto L3Bytes;
  }
  else if ((c & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_0111;
    if (p + 2 < end) // The third and fourth byte must lie inside the string.
      goto L4Bytes;
  }
  else
  { // 5 and 6 byte UTF-8 sequences are not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  }
  goto Lerror;

  // Decode the bytes now. Control falls through from label to label, so
  // each entry point consumes exactly the remaining trail bytes.
L4Bytes:
  mixin(appendSixBits);
  mixin(checkNextByte);
L3Bytes:
  mixin(appendSixBits);
  mixin(checkNextByte);
L2Bytes:
  mixin(appendSixBits);

  assert(isTrailByte(c2));
  if (!isValidChar(c)) // Final check for validity.
    goto Lerror;
Lreturn:
  ref_p = p+1; // Character is valid. Advance the pointer.
  return c;
Lerror:
  return ERROR_CHAR;
}
272 
/// Encodes c as UTF-8 and appends it to str.
/// Params:
///   str = The string to append to.
///   c = The character to encode; must satisfy isValidChar().
void encode(ref char[] str, dchar c)
{
  assert(isValidChar(c), "check for valid character before calling encode().");

  if (c < 0x80)
  { // ASCII: a single byte.
    str ~= c;
    return;
  }

  // There are no 5 and 6 byte UTF-8 sequences yet, so 4 bytes suffice.
  char[4] utf8 = void;
  size_t count;
  if (c < 0x800)
  { // 110xxxxx 10xxxxxx
    utf8[0] = cast(char)(0xC0 | (c >> 6));
    count = 2;
  }
  else if (c < 0x10000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    utf8[0] = cast(char)(0xE0 | (c >> 12));
    count = 3;
  }
  else if (c < 0x200000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    utf8[0] = cast(char)(0xF0 | (c >> 18));
    count = 4;
  }
  else
    assert(0);

  // Fill in the trail bytes from last to first, six bits at a time.
  uint bits = c;
  foreach_reverse (k; 1 .. count)
  {
    utf8[k] = cast(char)(0x80 | (bits & 0x3F));
    bits >>= 6;
  }
  str ~= utf8[0 .. count];
}
326 
/// Writes the UTF-8 encoding of c to a buffer that must be of sufficient
/// length (up to 4 bytes are written).
/// Params:
///   p = Start of the destination buffer.
///   c = The character to encode; must satisfy isValidChar().
/// Returns: The slice of the buffer that holds the encoded character.
char[] encode(char* p, dchar c)
{
  assert(isValidChar(c), "check for valid character before calling encode().");

  size_t count;
  if (c < 0x80)
  { // ASCII: a single byte.
    p[0] = cast(char)c;
    count = 1;
  }
  else
  {
    // There are no 5 and 6 byte UTF-8 sequences yet.
    if (c < 0x800)
    { // 110xxxxx 10xxxxxx
      p[0] = cast(char)(0xC0 | (c >> 6));
      count = 2;
    }
    else if (c < 0x10000)
    { // 1110xxxx 10xxxxxx 10xxxxxx
      p[0] = cast(char)(0xE0 | (c >> 12));
      count = 3;
    }
    else if (c < 0x200000)
    { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      p[0] = cast(char)(0xF0 | (c >> 18));
      count = 4;
    }
    else
      assert(0);

    // Fill in the trail bytes from last to first, six bits at a time.
    uint bits = c;
    foreach_reverse (k; 1 .. count)
    {
      p[k] = cast(char)(0x80 | (bits & 0x3F));
      bits >>= 6;
    }
  }
  return p[0 .. count];
}
376 
/// Encodes c as UTF-16 and appends it to str.
/// Params:
///   str = The string to append to.
///   c = The character to encode; must satisfy isValidChar().
void encode(ref wchar[] str, dchar c)
in { assert(isValidChar(c)); }
body
{
  if (c >= 0x10000)
  { // Outside the BMP: encode with a surrogate pair.
    const uint offset = c - 0x10000; // c'
    wchar[2] units = void;
    // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
    units[0] = cast(wchar)(0xD800 | (offset >> 10));
    // lower10bits(c') | 0b1101_11yy_yyyy_yyyy
    units[1] = cast(wchar)(0xDC00 | (offset & 0x3FF));
    str ~= units;
  }
  else
    str ~= cast(wchar)c; // A single code unit suffices.
}
395 
/// Decodes a character from a UTF-16 sequence.
/// Params:
///   str = The UTF-16 sequence.
///   index = Where to start from. Advanced past the decoded character
///           (by 1 or 2 code units); unchanged on error.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(cwstring str, ref size_t index)
in { assert(str.length && index < str.length, "empty string or reached end"); }
out(c) { assert(index <= str.length && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  dchar c = str[index];
  if (0xD800 > c || c > 0xDFFF)
  { // Not a surrogate: the code unit is the character itself.
    ++index;
    return c;
  }
  // A pair must start with a high surrogate and have a second code unit.
  if (c <= 0xDBFF && index+1 < str.length)
  {
    wchar c2 = str[index+1];
    if (0xDC00 <= c2 && c2 <= 0xDFFF)
    { // Decode surrogate pair.
      // (c - 0xD800) << 10 + 0x10000 ->
      // (c - 0xD800 + 0x40) << 10 ->
      c = (c - 0xD7C0) << 10;
      c |= (c2 & 0x3FF);
      if (isValidChar(c))
      {
        index += 2;
        return c;
      }
    }
  }
  return ERROR_CHAR;
}
424 
/// Decodes a character from a UTF-16 sequence.
/// Params:
///   p = Start of the UTF-16 sequence. Advanced past the decoded
///       character (by 1 or 2 code units); unchanged on error.
///   end = One past the end of the sequence.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(ref cwchar* p, cwchar* end)
in { assert(p && p < end, "p is null or at the end of the string"); }
out(c) { assert(p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  dchar c = *p;
  if (0xD800 > c || c > 0xDFFF)
  { // Not a surrogate: the code unit is the character itself.
    ++p;
    return c;
  }
  // A pair must start with a high surrogate and have a second code unit.
  if (c <= 0xDBFF && p+1 < end)
  {
    wchar c2 = p[1];
    if (0xDC00 <= c2 && c2 <= 0xDFFF)
    { // Decode surrogate pair: ((c - 0xD800) << 10) + 0x10000 + low bits.
      c = (c - 0xD7C0) << 10;
      c |= (c2 & 0x3FF);
      if (isValidChar(c))
      {
        p += 2;
        return c;
      }
    }
  }
  return ERROR_CHAR;
}
451 
/// Decodes a character from a zero-terminated UTF-16 string.
/// Params:
///   p = Start of the UTF-16 sequence; must not point at the terminator.
///       Advanced past the decoded character (by 1 or 2 code units);
///       unchanged on error.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(ref cwchar* p)
in { assert(p && *p, "p is null or at the end of the string"); }
out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
body
{
  assert(p);
  dchar c = *p;
  if (0xD800 > c || c > 0xDFFF)
  { // Not a surrogate: the code unit is the character itself.
    ++p;
    return c;
  }
  if (c <= 0xDBFF)
  {
    // Reading p[1] is in bounds: *p is not the terminator, so at worst
    // p[1] is the terminating zero, which is not a low surrogate.
    wchar c2 = p[1];
    if (0xDC00 <= c2 && c2 <= 0xDFFF)
    { // Decode surrogate pair: ((c - 0xD800) << 10) + 0x10000 + low bits.
      c = (c - 0xD7C0) << 10;
      c |= (c2 & 0x3FF);
      if (isValidChar(c))
      {
        p += 2;
        return c;
      }
    }
  }
  return ERROR_CHAR;
}
478 
/// Converts a string with code units of type A to one with code units
/// of type B. Every sequence that fails to decode is replaced with
/// REPLACEMENT_CHAR, and input is skipped up to the next code unit
/// accepted by isValidLead().
B[] convertString(A, B)(const(A)[] str)
{
  B[] converted;
  const total = str.length;
  for (size_t pos = 0; pos < total; )
  {
    dchar ch = decode(str, pos);
    if (ch == ERROR_CHAR)
    { // Skip to valid lead char.
      do
        ++pos;
      while (pos < total && !isValidLead(str[pos]));
      ch = REPLACEMENT_CHAR;
    }
    static if (is(B == dchar))
      converted ~= ch; // Just append. No need for an encoding function.
    else
      encode(converted, ch);
  }
  return converted;
}
500 
/// Converts a UTF-8 string to a UTF-16 string.
/// Faulty input sequences are replaced with REPLACEMENT_CHAR.
alias toUTF16 = convertString!(char, wchar);
/// Converts a UTF-8 string to a UTF-32 string.
/// Faulty input sequences are replaced with REPLACEMENT_CHAR.
alias toUTF32 = convertString!(char, dchar);
/// Converts a UTF-16 string to a UTF-8 string.
/// Faulty input sequences are replaced with REPLACEMENT_CHAR.
alias toUTF8 = convertString!(wchar, char);