/// Author: Aziz Köksal
/// License: GPL3
/// $(Maturity very high)
module dil.Unicode;

import common;

public import util.uni : isUniAlpha;

/// U+FFFD = �. Used to replace invalid Unicode characters.
enum cdchar REPLACEMENT_CHAR = '\uFFFD';
enum cchar[3] REPLACEMENT_STR = "\uFFFD"; /// Ditto
/// Invalid character, returned on errors.
/// (0xD800 is a surrogate code point, so it can never be a decoded character.)
enum dchar ERROR_CHAR = 0xD800;

/// Returns: true if this character is not a surrogate
/// code point and not higher than 0x10FFFF.
bool isValidChar(dchar d)
{
  return d < 0xD800 || (d > 0xDFFF && d <= 0x10FFFF);
}

/// There are a total of 66 noncharacters.
/// Returns: true if this is one of them.
/// See_also: Chapter 16.7 Noncharacters in Unicode 5.0
bool isNoncharacter(dchar d)
{
  return (0xFDD0 <= d && d <= 0xFDEF) || // 32
         (d <= 0x10FFFF && (d & 0xFFFF) >= 0xFFFE); // 34
}

/// Returns: true if this is a trail byte of a UTF-8 sequence.
bool isTrailByte(ubyte b)
{
  return (b & 0xC0) == 0x80; // 10xx_xxxx
}

/// Returns: true if this is a lead byte of a UTF-8 sequence.
bool isLeadByte(ubyte b)
{
  return (b & 0xC0) == 0xC0; // 11xx_xxxx
}

/// Returns: true if c can start a UTF-8 sequence, i.e. if it is an
/// ASCII character or a lead byte other than 0xC0 and 0xC1.
bool isValidLead(char c)
{ // NB: not all overlong sequences are checked.
  // An ASCII character is a complete one-byte sequence and therefore a
  // valid starting point. Without this test, loops that skip to the next
  // valid lead would run over ASCII text.
  return c < 0x80 ||
         ((c & 0xC0) == 0xC0 && (c & 0xFE) != 0xC0);
}

/// Returns: true if c can start a UTF-16 sequence,
/// i.e. if it is not in the range 0xDC00-0xDFFF.
bool isValidLead(wchar c)
{
  return c <= 0xDBFF || c > 0xDFFF;
}

/// Returns: true if c is a valid character (see isValidChar().)
bool isValidLead(dchar c)
{
  return isValidChar(c);
}

/// Enumeration of errors related to decoding UTF-8 sequences.
enum UTF8Error
{
  Invalid,    /// The correctly decoded character is invalid.
  Overlong,   /// Overlong sequence (must be encoded with fewer bytes.)
  TrailByte,  /// Missing trail byte.
  Over4Bytes, /// The sequence is longer than 4 bytes.
}
/// Enumeration of errors related to decoding UTF-16 sequences.
enum UTF16Error
{
  Invalid,     /// The correctly decoded character is invalid.
  LoSurrogate, /// Missing low surrogate wchar.
  HiSurrogate, /// Missing high surrogate wchar.
}

/// Returns the precise error in a UTF-8 sequence.
/// Params:
///   s = The string containing the erroneous sequence.
///   i = Index of the first byte of the sequence. Updated to the position
///       at which the pointer-based overload stopped.
UTF8Error utf8Error(cstring s, ref size_t i)
{
  auto p = s.ptr + i;
  auto e = utf8Error(p, s.ptr + s.length);
  i = p - s.ptr;
  return e;
}
/// ditto
/// Params:
///   p = Points to the first byte of a sequence that decode() rejects.
///       Always advanced by at least one byte.
///   end = One past the last byte of the string.
UTF8Error utf8Error(ref cchar* p, cchar* end)
in { auto p_ = p; assert(decode(p_, end) == ERROR_CHAR); }
body
{
  // If no other problem is found, the sequence is well-formed,
  // but the character it encodes is invalid.
  UTF8Error error = UTF8Error.Invalid;
  dchar c = *p;
  assert(c >= 0x80);
  // The second byte must exist and be a trail byte.
  if (!(++p < end && isTrailByte(*p)))
    return UTF8Error.TrailByte;
  // Same overlong test as in decode().
  if (c.In(0xE0, 0xF0, 0xF8, 0xFC) && (c & *p) == 0x80 ||
      (c & 0xFE) == 0xC0) // 1100000x
    return UTF8Error.Overlong;
  if ((c & 0b1110_0000) == 0b1100_0000)
  {} // 2 bytes: nothing more to check.
  else if ((c & 0b1111_0000) == 0b1110_0000)
  { // 3 bytes: one more trail byte is required.
    if (!(p + 1 < end && isTrailByte(*++p)))
      error = UTF8Error.TrailByte;
  }
  else if ((c & 0b1111_1000) == 0b1111_0000)
  { // 4 bytes: two more trail bytes are required.
    if (!(p + 2 < end && isTrailByte(*++p) && isTrailByte(*++p)))
      error = UTF8Error.TrailByte;
  }
  else
    error = UTF8Error.Over4Bytes;
  return error;
}
/// Returns the precise error in a UTF-16 sequence.
UTF16Error utf16Error(cwstring s, ref size_t i)
{
  // Delegate to the pointer-based overload and map the pointer back
  // to an index afterwards.
  auto start = s.ptr;
  auto cursor = start + i;
  auto kind = utf16Error(cursor, start + s.length);
  i = cursor - start;
  return kind;
}
/// ditto
/// Params:
///   p = Points to the first unit of a sequence that decode() rejects.
///       Advanced by one unit, or by two when a low surrogate follows.
///   end = One past the last unit of the string.
UTF16Error utf16Error(ref cwchar* p, cwchar* end)
in { auto p_ = p; assert(decode(p_, end) == ERROR_CHAR); }
body
{
  dchar unit = *p;
  if (unit > 0xDBFF)
  { // Not a high surrogate: the high surrogate is missing.
    ++p;
    return UTF16Error.HiSurrogate;
  }
  if (p+1 < end)
  {
    dchar next = p[1];
    if (0xDC00 <= next && next <= 0xDFFF)
    { // The pair is complete. Skip both units.
      p += 2;
      return UTF16Error.Invalid;
    }
  }
  // No low surrogate follows.
  ++p;
  return UTF16Error.LoSurrogate;
}

// NB: All functions below advance the pointer/index only
//     when the decoded Unicode sequence was valid.

/// Decodes a non-ASCII Unicode alpha character.
/// ref_p is advanced only if such a character was found.
/// Params:
///   ref_p = Set to the last trail byte of the valid UTF-8 sequence.
///   end = One past the last byte of the string.
/// Returns: The valid alpha character or 0.
dchar decodeUnicodeAlpha(ref cchar* ref_p, cchar* end)
in { assert(ref_p && ref_p < end); }
out(c) { assert(c == 0 || isUniAlpha(c)); }
body
{
  if (*ref_p < 0x80)
    return 0; // ASCII characters are not handled here.
  auto cursor = ref_p;
  dchar decoded = decode(cursor, end);
  if (!isUniAlpha(decoded))
    return 0;
  // decode() stops one past the sequence. Step back to the last trail byte.
  ref_p = cursor - 1;
  return decoded;
}

/// Returns true when p points to a valid Unicode alpha character
/// (also advances p.)
bool scanUnicodeAlpha(ref cchar* p, cchar* end)
{
  return decodeUnicodeAlpha(p, end) != 0;
}

/// Returns true when p points to a valid Unicode alpha character.
bool isUnicodeAlpha(cchar* p, cchar* end)
{
  // p is a by-value copy, so the caller's pointer is not moved.
  return decodeUnicodeAlpha(p, end) != 0;
}

/// Decodes a character from str at index.
/// Params:
///   index = Set to one past the ASCII char or one past the last trail byte
///           of the valid UTF-8 sequence.
dchar decode(cstring str, ref size_t index)
in { assert(str.length && index < str.length); }
out { assert(index <= str.length); }
body
{
  auto start = str.ptr;
  auto cursor = start + index;
  dchar decoded = decode(cursor, start + str.length);
  // The index is left untouched when the sequence is erroneous.
  if (decoded != ERROR_CHAR)
    index = cursor - start;
  return decoded;
}

/// Decodes a character starting at ref_p.
/// Params:
///   ref_p = Set to one past the ASCII char or one past the last trail byte
///           of the valid UTF-8 sequence.
///   end = One past the last byte of the string.
/// Returns: The decoded character, or ERROR_CHAR (ref_p is then unchanged.)
dchar decode(ref cchar* ref_p, cchar* end)
in { assert(ref_p && ref_p < end); }
out(c) { assert(ref_p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  auto p = ref_p;
  dchar c = *p;

  if (c < 0x80)
  { // ASCII character.
    ref_p = p + 1;
    return c;
  }

  // Error if: end of string or second byte is not a trail byte.
  if (!(++p < end))
    return ERROR_CHAR;
  char c2 = *p;
  if (!isTrailByte(c2))
    return ERROR_CHAR;

  // Check for overlong sequences.
  // 0xC0/0xC1: c=1100000x
  // 0xE0: c=11100000 c2=100xxxxx
  // 0xF0: c=11110000 c2=1000xxxx
  // 0xF8: c=11111000 c2=10000xxx
  // 0xFC: c=11111100 c2=100000xx
  if ((c & 0xFE) == 0xC0 ||
      (c.In(0xE0, 0xF0, 0xF8, 0xFC) && (c & c2) == 0x80))
    return ERROR_CHAR;

  assert(p == ref_p+1, "p doesn't point to the second byte");

  // Strip the length marker from the lead byte and determine how
  // many trail bytes have to follow the second byte.
  size_t extra;
  if ((c & 0b1110_0000) == 0b1100_0000)
  { // 110xxxxx 10xxxxxx
    c &= 0b0001_1111;
    extra = 0;
  }
  else if ((c & 0b1111_0000) == 0b1110_0000)
  { // 1110xxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_1111;
    extra = 1;
  }
  else if ((c & 0b1111_1000) == 0b1111_0000)
  { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    c &= 0b0000_0111;
    extra = 2;
  }
  else
  { // 5 and 6 byte UTF-8 sequences are not allowed yet.
    // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    return ERROR_CHAR;
  }

  // All remaining bytes must lie inside the string.
  if (!(p + extra < end))
    return ERROR_CHAR;

  // Decode the bytes now. Start with the payload of the second byte.
  c = (c << 6) | c2 & 0b0011_1111;
  foreach (i; 0 .. extra)
  {
    c2 = *++p;
    if (!isTrailByte(c2))
      return ERROR_CHAR;
    c = (c << 6) | c2 & 0b0011_1111;
  }

  assert(isTrailByte(c2));
  if (!isValidChar(c)) // Final check for validity.
    return ERROR_CHAR;

  ref_p = p + 1; // Character is valid. Advance the pointer.
  return c;
}

/// Encodes c as UTF-8 and appends it to str.
void encode(ref char[] str, dchar c)
{
  assert(isValidChar(c), "check for valid character before calling encode().");

  if (c < 0x80)
    str ~= cast(char)c;
  else if (c < 0x800)
  {
    char[2] seq = void;
    seq[0] = 0xC0 | cast(char)(c >> 6);
    seq[1] = 0x80 | (c & 0x3F);
    str ~= seq[];
  }
  else if (c < 0x10000)
  {
    char[3] seq = void;
    seq[0] = 0xE0 | cast(char)(c >> 12);
    seq[1] = 0x80 | ((c >> 6) & 0x3F);
    seq[2] = 0x80 | (c & 0x3F);
    str ~= seq[];
  }
  else if (c < 0x200000)
  {
    char[4] seq = void;
    seq[0] = 0xF0 | (c >> 18);
    seq[1] = 0x80 | ((c >> 12) & 0x3F);
    seq[2] = 0x80 | ((c >> 6) & 0x3F);
    seq[3] = 0x80 | (c & 0x3F);
    str ~= seq[];
  }
  else // There are no 5 and 6 byte UTF-8 sequences yet.
    assert(0);
}

/// Writes the encoded character to a buffer that must be of sufficient length.
char[] encode(char* p, dchar c)
{
  assert(isValidChar(c), "check for valid character before calling encode().");

  size_t len; // Number of bytes written to the buffer.
  if (c < 0x80)
  {
    p[0] = cast(char)c;
    len = 1;
  }
  else if (c < 0x800)
  {
    p[0] = 0xC0 | cast(char)(c >> 6);
    p[1] = 0x80 | (c & 0x3F);
    len = 2;
  }
  else if (c < 0x10000)
  {
    p[0] = 0xE0 | cast(char)(c >> 12);
    p[1] = 0x80 | ((c >> 6) & 0x3F);
    p[2] = 0x80 | (c & 0x3F);
    len = 3;
  }
  else if (c < 0x200000)
  {
    p[0] = 0xF0 | (c >> 18);
    p[1] = 0x80 | ((c >> 12) & 0x3F);
    p[2] = 0x80 | ((c >> 6) & 0x3F);
    p[3] = 0x80 | (c & 0x3F);
    len = 4;
  }
  else // There are no 5 and 6 byte UTF-8 sequences yet.
    assert(0);
  // Return the part of the buffer that holds the encoded character.
  return p[0 .. len];
}

/// Encodes c as UTF-16 and appends it to str.
void encode(ref wchar[] str, dchar c)
in { assert(isValidChar(c)); }
body
{
  if (c < 0x10000)
  { // A single code unit suffices.
    str ~= cast(wchar)c;
    return;
  }
  // Encode with surrogate pair.
  wchar[2] units = void;
  c -= 0x10000; // c'
  // higher10bits(c') | 0b1101_10xx_xxxx_xxxx
  units[0] = (c >> 10) | 0xD800;
  // lower10bits(c') | 0b1101_11yy_yyyy_yyyy
  units[1] = (c & 0x3FF) | 0xDC00;
  str ~= units;
}

/// Decodes a character from a UTF-16 sequence.
/// Params:
///   str = The UTF-16 sequence.
///   index = Where to start from.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(cwstring str, ref size_t index)
in { assert(str.length && index < str.length, "empty string or reached end"); }
out(c) { assert(index <= str.length && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  dchar unit = str[index];
  if (unit < 0xD800 || 0xDFFF < unit)
  { // Not a surrogate: the code unit is the character.
    ++index;
    return unit;
  }
  // A high surrogate followed by another code unit is required.
  if (unit > 0xDBFF || !(index+1 < str.length))
    return ERROR_CHAR;
  wchar low = str[index+1];
  if (low < 0xDC00 || 0xDFFF < low)
    return ERROR_CHAR;
  // Decode surrogate pair.
  // (unit - 0xD800) << 10 + 0x10000 ->
  // (unit - 0xD800 + 0x40) << 10 ->
  unit = (unit - 0xD7C0) << 10;
  unit |= (low & 0x3FF);
  if (!isValidChar(unit))
    return ERROR_CHAR;
  index += 2; // Advance only for a valid pair.
  return unit;
}

/// Decodes a character from a UTF-16 sequence.
/// Params:
///   p = Start of the UTF-16 sequence.
///   end = One past the end of the sequence.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(ref cwchar* p, cwchar* end)
in { assert(p && p < end, "p is null or at the end of the string"); }
out(c) { assert(p <= end && (isValidChar(c) || c == ERROR_CHAR)); }
body
{
  dchar unit = *p;
  if (unit < 0xD800 || 0xDFFF < unit)
  { // Not a surrogate: the code unit is the character.
    ++p;
    return unit;
  }
  // A high surrogate followed by another code unit is required.
  if (unit > 0xDBFF || !(p+1 < end))
    return ERROR_CHAR;
  wchar low = p[1];
  if (low < 0xDC00 || 0xDFFF < low)
    return ERROR_CHAR;
  // Combine the pair, see decode(cwstring, ref size_t).
  unit = (unit - 0xD7C0) << 10;
  unit |= (low & 0x3FF);
  if (!isValidChar(unit))
    return ERROR_CHAR;
  p += 2; // Advance only for a valid pair.
  return unit;
}

/// Decodes a character from a zero-terminated UTF-16 string.
/// Params:
///   p = Start of the UTF-16 sequence.
/// Returns: ERROR_CHAR in case of an error in the sequence.
dchar decode(ref cwchar* p)
in { assert(p && *p, "p is null or at the end of the string"); }
out(c) { assert(isValidChar(c) || c == ERROR_CHAR); }
body
{
  assert(p);
  dchar c = *p;
  if (0xD800 > c || c > 0xDFFF)
  { // Not a surrogate: the code unit is the character.
    ++p;
    return c;
  }
  if (c <= 0xDBFF)
  { // No bounds check here: *p is not the terminator (see the in-contract),
    // and p[1] is read on the assumption that the string is zero-terminated.
    wchar c2 = p[1];
    if (0xDC00 <= c2 && c2 <= 0xDFFF)
    { // Decode surrogate pair.
      c = (c - 0xD7C0) << 10;
      c |= (c2 & 0x3FF);
      if (isValidChar(c))
      {
        p += 2; // Advance only for a valid pair.
        return c;
      }
    }
  }
  return ERROR_CHAR;
}

/// Converts a string from type A to B.
/// Every malformed sequence in str, together with the code units that are
/// skipped while resynchronizing, is replaced by one REPLACEMENT_CHAR.
/// Params:
///   str = The string to be converted.
/// Returns: A newly built string with elements of type B.
B[] convertString(A, B)(const(A)[] str)
{
  B[] result;
  size_t idx, len = str.length;
  while (idx < len)
  {
    auto c = decode(str, idx);
    if (c == ERROR_CHAR)
    { // Skip to the next code unit that can start a sequence.
      // The explicit ASCII test guarantees that ASCII characters following
      // a malformed sequence are never skipped, regardless of how
      // isValidLead() treats them.
      while (++idx < len && str[idx] >= 0x80 && !isValidLead(str[idx]))
      {}
      c = REPLACEMENT_CHAR;
    }
    static if (is(B == dchar))
      result ~= c; // Just append. No need for an encoding function.
    else
      encode(result, c);
  }
  return result;
}

/// Converts a UTF-8 string to a UTF-16 string.
alias toUTF16 = convertString!(char, wchar);
/// Converts a UTF-8 string to a UTF-32 string.
alias toUTF32 = convertString!(char, dchar);
/// Converts a UTF-16 string to a UTF-8 string.
alias toUTF8 = convertString!(wchar, char);