// Written in the D programming language.
//
// Placed into the Public Domain.
// Digital Mars, www.digitalmars.com
// Written by Walter Bright
// Modified by Aziz Köksal

/// Simple Unicode character classification functions.
/// References:
///   $(LINK2 http://www.digitalmars.com/d/ascii-table.html, ASCII Table),
///   $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia),
///   $(LINK2 http://www.unicode.org, The Unicode Consortium)
/// Trademarks:
///   Unicode™ is a trademark of Unicode, Inc.
/// Copyright:
///   Public Domain.
module util.uni;

/// Returns non-zero if c is a Unicode lower case character.
///
/// ASCII code points are decided by a plain range check. Every other code
/// point is reported as lower case when isUniAlpha() accepts it and
/// toUniLower() returns it unchanged.
int isUniLower(dchar c)
{
  if (c > 0x7F)
    return isUniAlpha(c) && toUniLower(c) == c;
  return 'a' <= c && c <= 'z';
}

/// Returns non-zero if c is a Unicode upper case character.
///
/// ASCII code points are decided by a plain range check. Every other code
/// point is reported as upper case when isUniAlpha() accepts it and
/// toUniUpper() returns it unchanged.
int isUniUpper(dchar c)
{
  if (c > 0x7F)
    return isUniAlpha(c) && toUniUpper(c) == c;
  return 'A' <= c && c <= 'Z';
}

/// If c is a Unicode upper case character, return the lower case
/// equivalent, otherwise return c.
///
/// Only the simple one-to-one mappings listed in the body are implemented;
/// any code point outside those ranges is returned unchanged.
///
/// Params:
///   c = the code point to convert.
/// Returns:
///   The lower case counterpart of c, or c itself if no mapping is known.
dchar toUniLower(dchar c)
{
  if (c >= 'A' && c <= 'Z')
    c += 32;
  else if (c >= 0x00C0)
  {
    if ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00DE))
      c += 32; // U+00D7 (multiplication sign) is deliberately skipped.
    else if ((c >= 0x0100 && c < 0x0138) || (c > 0x0149 && c < 0x0178))
    {
      // In these ranges the upper case letter is the even code point.
      if (c == 0x0130)
        c = 0x0069; // Capital I with dot above maps to ASCII 'i'.
      else if ((c & 1) == 0)
        c += 1;
    }
    else if (c == 0x0178)
      c = 0x00FF;
    else if ((c >= 0x0139 && c < 0x0149) || (c > 0x0178 && c < 0x017F))
    {
      // In these ranges the upper case letter is the odd code point.
      if (c & 1)
        c += 1;
    }
    else if (c >= 0x0200 && c <= 0x0217)
    {
      if ((c & 1) == 0)
        c += 1;
    }
    else if (c >= 0x0400 && c <= 0x040F)
      // The whole row maps to U+0450..U+045F. This includes U+0400 and
      // U+040D, which have lower case forms U+0450 and U+045D.
      c += 80;
    else if (c >= 0x0410 && c <= 0x042F)
      c += 32;
    else if (c >= 0x0460 && c <= 0x047F)
    {
      if ((c & 1) == 0)
        c += 1;
    }
    else if (c >= 0x0531 && c <= 0x0556)
      c += 48;
    else if (c >= 0x10A0 && c <= 0x10C5)
      // Georgian capitals map to U+2D00..U+2D25, not to U+10D0.. which is
      // a caseless alphabet.
      c += 0x2D00 - 0x10A0;
    else if (c >= 0xFF21 && c <= 0xFF3A)
      c += 32;
  }
  return c;
}

/// If c is a Unicode lower case character, return the upper case
/// equivalent, otherwise return c.
///
/// Only the simple one-to-one mappings listed in the body are implemented;
/// any code point outside those ranges is returned unchanged.
///
/// Params:
///   c = the code point to convert.
/// Returns:
///   The upper case counterpart of c, or c itself if no mapping is known.
dchar toUniUpper(dchar c)
{
  if (c >= 'a' && c <= 'z')
    c -= 32;
  else if (c >= 0x00E0)
  {
    if ((c >= 0x00E0 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FE))
      c -= 32; // U+00F7 (division sign) is deliberately skipped.
    else if (c == 0x00FF)
      c = 0x0178;
    else if ((c >= 0x0100 && c < 0x0138) || (c > 0x0149 && c < 0x0178))
    {
      // In these ranges the lower case letter is the odd code point.
      if (c == 0x0131)
        c = 0x0049; // Dotless i maps to ASCII 'I'.
      else if (c & 1)
        c -= 1;
    }
    else if ((c >= 0x0139 && c < 0x0149) || (c > 0x0178 && c < 0x017F))
    {
      // In these ranges the lower case letter is the even code point.
      if ((c & 1) == 0)
        c -= 1;
    }
    else if (c == 0x017F)
      c = 0x0053; // Long s maps to ASCII 'S'.
    else if (c >= 0x0200 && c <= 0x0217)
    {
      if (c & 1)
        c -= 1;
    }
    else if (c >= 0x0430 && c <= 0x044F)
      c -= 32;
    else if (c >= 0x0450 && c <= 0x045F)
      // The whole row maps to U+0400..U+040F. This includes U+0450 and
      // U+045D, which have upper case forms U+0400 and U+040D.
      c -= 80;
    else if (c >= 0x0460 && c <= 0x047F)
    {
      if (c & 1)
        c -= 1;
    }
    else if (c >= 0x0561 && c < 0x0587)
      c -= 48; // U+0587 is excluded by the strict upper bound.
    else if (c >= 0x2D00 && c <= 0x2D25)
      // Georgian small letters map to the capitals U+10A0..U+10C5.
      c -= 0x2D00 - 0x10A0;
    else if (c >= 0xFF41 && c <= 0xFF5A)
      c -= 32;
  }
  return c;
}

/// A sorted list of Unicode ranges that define valid UniAlpha characters.
/// Each entry is an inclusive [first, last] pair of code points. Entries are
/// in ascending order and do not overlap; isUniAlpha() relies on this for its
/// binary search and testIsUniAlpha() verifies it.
static const dchar[2][] uniAlphaTable = [
  ['A', 'Z'],
  ['a', 'z'],
  [0x00AA, 0x00AA],
  [0x00B5, 0x00B5],
  [0x00BA, 0x00BA],
  [0x00C0, 0x00D6],
  [0x00D8, 0x00F6],
  [0x00F8, 0x02C1],
  [0x02C6, 0x02D1],
  [0x02E0, 0x02E4],
  [0x02EE, 0x02EE],
  [0x037A, 0x037D],
  [0x0386, 0x0386],
  [0x0388, 0x038A],
  [0x038C, 0x038C],
  [0x038E, 0x03A1],
  [0x03A3, 0x03CE],
  [0x03D0, 0x03F5],
  [0x03F7, 0x0481],
  [0x048A, 0x0513],
  [0x0531, 0x0556],
  [0x0559, 0x0559],
  [0x0561, 0x0587],
  [0x05D0, 0x05EA],
  [0x05F0, 0x05F2],
  [0x0621, 0x063A],
  [0x0640, 0x064A],
  [0x066E, 0x066F],
  [0x0671, 0x06D3],
  [0x06D5, 0x06D5],
  [0x06E5, 0x06E6],
  [0x06EE, 0x06EF],
  [0x06FA, 0x06FC],
  [0x06FF, 0x06FF],
  [0x0710, 0x0710],
  [0x0712, 0x072F],
  [0x074D, 0x076D],
  [0x0780, 0x07A5],
  [0x07B1, 0x07B1],
  [0x07CA, 0x07EA],
  [0x07F4, 0x07F5],
  [0x07FA, 0x07FA],
  [0x0904, 0x0939],
  [0x093D, 0x093D],
  [0x0950, 0x0950],
  [0x0958, 0x0961],
  [0x097B, 0x097F],
  [0x0985, 0x098C],
  [0x098F, 0x0990],
  [0x0993, 0x09A8],
  [0x09AA, 0x09B0],
  [0x09B2, 0x09B2],
  [0x09B6, 0x09B9],
  [0x09BD, 0x09BD],
  [0x09CE, 0x09CE],
  [0x09DC, 0x09DD],
  [0x09DF, 0x09E1],
  [0x09F0, 0x09F1],
  [0x0A05, 0x0A0A],
  [0x0A0F, 0x0A10],
  [0x0A13, 0x0A28],
  [0x0A2A, 0x0A30],
  [0x0A32, 0x0A33],
  [0x0A35, 0x0A36],
  [0x0A38, 0x0A39],
  [0x0A59, 0x0A5C],
  [0x0A5E, 0x0A5E],
  [0x0A72, 0x0A74],
  [0x0A85, 0x0A8D],
  [0x0A8F, 0x0A91],
  [0x0A93, 0x0AA8],
  [0x0AAA, 0x0AB0],
  [0x0AB2, 0x0AB3],
  [0x0AB5, 0x0AB9],
  [0x0ABD, 0x0ABD],
  [0x0AD0, 0x0AD0],
  [0x0AE0, 0x0AE1],
  [0x0B05, 0x0B0C],
  [0x0B0F, 0x0B10],
  [0x0B13, 0x0B28],
  [0x0B2A, 0x0B30],
  [0x0B32, 0x0B33],
  [0x0B35, 0x0B39],
  [0x0B3D, 0x0B3D],
  [0x0B5C, 0x0B5D],
  [0x0B5F, 0x0B61],
  [0x0B71, 0x0B71],
  [0x0B83, 0x0B83],
  [0x0B85, 0x0B8A],
  [0x0B8E, 0x0B90],
  [0x0B92, 0x0B95],
  [0x0B99, 0x0B9A],
  [0x0B9C, 0x0B9C],
  [0x0B9E, 0x0B9F],
  [0x0BA3, 0x0BA4],
  [0x0BA8, 0x0BAA],
  [0x0BAE, 0x0BB9],
  [0x0C05, 0x0C0C],
  [0x0C0E, 0x0C10],
  [0x0C12, 0x0C28],
  [0x0C2A, 0x0C33],
  [0x0C35, 0x0C39],
  [0x0C60, 0x0C61],
  [0x0C85, 0x0C8C],
  [0x0C8E, 0x0C90],
  [0x0C92, 0x0CA8],
  [0x0CAA, 0x0CB3],
  [0x0CB5, 0x0CB9],
  [0x0CBD, 0x0CBD],
  [0x0CDE, 0x0CDE],
  [0x0CE0, 0x0CE1],
  [0x0D05, 0x0D0C],
  [0x0D0E, 0x0D10],
  [0x0D12, 0x0D28],
  [0x0D2A, 0x0D39],
  [0x0D60, 0x0D61],
  [0x0D85, 0x0D96],
  [0x0D9A, 0x0DB1],
  [0x0DB3, 0x0DBB],
  [0x0DBD, 0x0DBD],
  [0x0DC0, 0x0DC6],
  [0x0E01, 0x0E30],
  [0x0E32, 0x0E33],
  [0x0E40, 0x0E46],
  [0x0E81, 0x0E82],
  [0x0E84, 0x0E84],
  [0x0E87, 0x0E88],
  [0x0E8A, 0x0E8A],
  [0x0E8D, 0x0E8D],
  [0x0E94, 0x0E97],
  [0x0E99, 0x0E9F],
  [0x0EA1, 0x0EA3],
  [0x0EA5, 0x0EA5],
  [0x0EA7, 0x0EA7],
  [0x0EAA, 0x0EAB],
  [0x0EAD, 0x0EB0],
  [0x0EB2, 0x0EB3],
  [0x0EBD, 0x0EBD],
  [0x0EC0, 0x0EC4],
  [0x0EC6, 0x0EC6],
  [0x0EDC, 0x0EDD],
  [0x0F00, 0x0F00],
  [0x0F40, 0x0F47],
  [0x0F49, 0x0F6A],
  [0x0F88, 0x0F8B],
  [0x1000, 0x1021],
  [0x1023, 0x1027],
  [0x1029, 0x102A],
  [0x1050, 0x1055],
  [0x10A0, 0x10C5],
  [0x10D0, 0x10FA],
  [0x10FC, 0x10FC],
  [0x1100, 0x1159],
  [0x115F, 0x11A2],
  [0x11A8, 0x11F9],
  [0x1200, 0x1248],
  [0x124A, 0x124D],
  [0x1250, 0x1256],
  [0x1258, 0x1258],
  [0x125A, 0x125D],
  [0x1260, 0x1288],
  [0x128A, 0x128D],
  [0x1290, 0x12B0],
  [0x12B2, 0x12B5],
  [0x12B8, 0x12BE],
  [0x12C0, 0x12C0],
  [0x12C2, 0x12C5],
  [0x12C8, 0x12D6],
  [0x12D8, 0x1310],
  [0x1312, 0x1315],
  [0x1318, 0x135A],
  [0x1380, 0x138F],
  [0x13A0, 0x13F4],
  [0x1401, 0x166C],
  [0x166F, 0x1676],
  [0x1681, 0x169A],
  [0x16A0, 0x16EA],
  [0x1700, 0x170C],
  [0x170E, 0x1711],
  [0x1720, 0x1731],
  [0x1740, 0x1751],
  [0x1760, 0x176C],
  [0x176E, 0x1770],
  [0x1780, 0x17B3],
  [0x17D7, 0x17D7],
  [0x17DC, 0x17DC],
  [0x1820, 0x1877],
  [0x1880, 0x18A8],
  [0x1900, 0x191C],
  [0x1950, 0x196D],
  [0x1970, 0x1974],
  [0x1980, 0x19A9],
  [0x19C1, 0x19C7],
  [0x1A00, 0x1A16],
  [0x1B05, 0x1B33],
  [0x1B45, 0x1B4B],
  [0x1D00, 0x1DBF],
  [0x1E00, 0x1E9B],
  [0x1EA0, 0x1EF9],
  [0x1F00, 0x1F15],
  [0x1F18, 0x1F1D],
  [0x1F20, 0x1F45],
  [0x1F48, 0x1F4D],
  [0x1F50, 0x1F57],
  [0x1F59, 0x1F59],
  [0x1F5B, 0x1F5B],
  [0x1F5D, 0x1F5D],
  [0x1F5F, 0x1F7D],
  [0x1F80, 0x1FB4],
  [0x1FB6, 0x1FBC],
  [0x1FBE, 0x1FBE],
  [0x1FC2, 0x1FC4],
  [0x1FC6, 0x1FCC],
  [0x1FD0, 0x1FD3],
  [0x1FD6, 0x1FDB],
  [0x1FE0, 0x1FEC],
  [0x1FF2, 0x1FF4],
  [0x1FF6, 0x1FFC],
  [0x2071, 0x2071],
  [0x207F, 0x207F],
  [0x2090, 0x2094],
  [0x2102, 0x2102],
  [0x2107, 0x2107],
  [0x210A, 0x2113],
  [0x2115, 0x2115],
  [0x2119, 0x211D],
  [0x2124, 0x2124],
  [0x2126, 0x2126],
  [0x2128, 0x2128],
  [0x212A, 0x212D],
  [0x212F, 0x2139],
  [0x213C, 0x213F],
  [0x2145, 0x2149],
  [0x214E, 0x214E],
  [0x2183, 0x2184],
  [0x2C00, 0x2C2E],
  [0x2C30, 0x2C5E],
  [0x2C60, 0x2C6C],
  [0x2C74, 0x2C77],
  [0x2C80, 0x2CE4],
  [0x2D00, 0x2D25],
  [0x2D30, 0x2D65],
  [0x2D6F, 0x2D6F],
  [0x2D80, 0x2D96],
  [0x2DA0, 0x2DA6],
  [0x2DA8, 0x2DAE],
  [0x2DB0, 0x2DB6],
  [0x2DB8, 0x2DBE],
  [0x2DC0, 0x2DC6],
  [0x2DC8, 0x2DCE],
  [0x2DD0, 0x2DD6],
  [0x2DD8, 0x2DDE],
  [0x3005, 0x3006],
  [0x3031, 0x3035],
  [0x303B, 0x303C],
  [0x3041, 0x3096],
  [0x309D, 0x309F],
  [0x30A1, 0x30FA],
  [0x30FC, 0x30FF],
  [0x3105, 0x312C],
  [0x3131, 0x318E],
  [0x31A0, 0x31B7],
  [0x31F0, 0x31FF],
  [0x3400, 0x4DB5],
  [0x4E00, 0x9FBB],
  [0xA000, 0xA48C],
  [0xA717, 0xA71A],
  [0xA800, 0xA801],
  [0xA803, 0xA805],
  [0xA807, 0xA80A],
  [0xA80C, 0xA822],
  [0xA840, 0xA873],
  [0xAC00, 0xD7A3],
  [0xF900, 0xFA2D],
  [0xFA30, 0xFA6A],
  [0xFA70, 0xFAD9],
  [0xFB00, 0xFB06],
  [0xFB13, 0xFB17],
  [0xFB1D, 0xFB1D],
  [0xFB1F, 0xFB28],
  [0xFB2A, 0xFB36],
  [0xFB38, 0xFB3C],
  [0xFB3E, 0xFB3E],
  [0xFB40, 0xFB41],
  [0xFB43, 0xFB44],
  [0xFB46, 0xFBB1],
  [0xFBD3, 0xFD3D],
  [0xFD50, 0xFD8F],
  [0xFD92, 0xFDC7],
  [0xFDF0, 0xFDFB],
  [0xFE70, 0xFE74],
  [0xFE76, 0xFEFC],
  [0xFF21, 0xFF3A],
  [0xFF41, 0xFF5A],
  [0xFF66, 0xFFBE],
  [0xFFC2, 0xFFC7],
  [0xFFCA, 0xFFCF],
  [0xFFD2, 0xFFD7],
  [0xFFDA, 0xFFDC],
  [0x10000, 0x1000B],
  [0x1000D, 0x10026],
  [0x10028, 0x1003A],
  [0x1003C, 0x1003D],
  [0x1003F, 0x1004D],
  [0x10050, 0x1005D],
  [0x10080, 0x100FA],
  [0x10300, 0x1031E],
  [0x10330, 0x10340],
  [0x10342, 0x10349],
  [0x10380, 0x1039D],
  [0x103A0, 0x103C3],
  [0x103C8, 0x103CF],
  [0x10400, 0x1049D],
  [0x10800, 0x10805],
  [0x10808, 0x10808],
  [0x1080A, 0x10835],
  [0x10837, 0x10838],
  [0x1083C, 0x1083C],
  [0x1083F, 0x1083F],
  [0x10900, 0x10915],
  [0x10A00, 0x10A00],
  [0x10A10, 0x10A13],
  [0x10A15, 0x10A17],
  [0x10A19, 0x10A33],
  [0x12000, 0x1236E],
  [0x1D400, 0x1D454],
  [0x1D456, 0x1D49C],
  [0x1D49E, 0x1D49F],
  [0x1D4A2, 0x1D4A2],
  [0x1D4A5, 0x1D4A6],
  [0x1D4A9, 0x1D4AC],
  [0x1D4AE, 0x1D4B9],
  [0x1D4BB, 0x1D4BB],
  [0x1D4BD, 0x1D4C3],
  [0x1D4C5, 0x1D505],
  [0x1D507, 0x1D50A],
  [0x1D50D, 0x1D514],
  [0x1D516, 0x1D51C],
  [0x1D51E, 0x1D539],
  [0x1D53B, 0x1D53E],
  [0x1D540, 0x1D544],
  [0x1D546, 0x1D546],
  [0x1D54A, 0x1D550],
  [0x1D552, 0x1D6A5],
  [0x1D6A8, 0x1D6C0],
  [0x1D6C2, 0x1D6DA],
  [0x1D6DC, 0x1D6FA],
  [0x1D6FC, 0x1D714],
  [0x1D716, 0x1D734],
  [0x1D736, 0x1D74E],
  [0x1D750, 0x1D76E],
  [0x1D770, 0x1D788],
  [0x1D78A, 0x1D7A8],
  [0x1D7AA, 0x1D7C2],
  [0x1D7C4, 0x1D7CB],
  [0x20000, 0x2A6D6],
  [0x2F800, 0x2FA1D],
];


/// Returns non-zero if u is a Unicode alpha character.
/// (General Unicode category: Lu, Ll, Lt, Lm and Lo)
///
/// The answer is found by a binary search over uniAlphaTable. In debug
/// builds the out contract cross-checks the result against a linear scan of
/// the same table.
///
/// Params:
///   u = the code point to classify.
/// Returns:
///   1 if u lies inside one of the ranges of uniAlphaTable, otherwise 0.
///
/// Standards: Unicode 5.0.0
int isUniAlpha(dchar u)
out(found)
{
  debug
  {
    // Reference implementation: linear scan over every range.
    bool inTable()
    {
      foreach (range; uniAlphaTable)
        if (range[0] <= u && u <= range[1])
          return true;
      return false;
    }
    assert(!!inTable() == !!found);
  }
}
body
{
  alias table = uniAlphaTable;
  // Quick path: below 0xAA the table contains only the two ASCII letter
  // ranges, so the answer is known without searching.
  if (u < 0xAA)
    return ('A' <= u && u <= 'Z') || ('a' <= u && u <= 'z');
  // Binary search the table:
  size_t mid = void,
         low = 0,
         high = table.length;
  static assert(table.length < size_t.max / 2, "'mid' may overflow!");
  while (low < high)
  {
    mid = (low + high) / 2;
    auto range = table[mid];
    if (u < range[0]) // The char is below the range.
      high = mid;
    else if (u > range[1]) // The char is above the range.
      low = mid + 1;
    else // The char is inside the range.
      return 1;
  }
  assert(high == low);
  return 0;
}

/// Self-test for uniAlphaTable and isUniAlpha().
///
/// Verifies that the table is sorted and non-overlapping, that exactly the
/// ASCII letters are accepted among the code points below 0x80, and spot
/// checks a handful of non-ASCII code points in both directions.
void testIsUniAlpha()
{
  import common;
  scope msg = new UnittestMsg("Testing function isUniAlpha().");

  // Check correctness of the ranges.
  foreach (i, range; uniAlphaTable)
  {
    assert(range[0] <= range[1]);
    if (i < uniAlphaTable.length - 1)
      assert(range[1] < uniAlphaTable[i + 1][0]);
  }

  // Check ASCII alphabet.
  // The counter is a dchar, initialised explicitly because dchar.init is not
  // 0, so it can be passed to isUniAlpha() without an integer conversion.
  for (dchar i = 0; i < 0x80; i++)
  {
    auto isUA = isUniAlpha(i);
    assert(('A' <= i && i <= 'Z' ||
            'a' <= i && i <= 'z') ?
            isUA : !isUA);
  }
  // Check some Unicode characters.
  dchar[] unichars = [0x0C8C, 0x0E81, 0x0EBD, 0x0F88,
    0x10FC, 0x17B3, 0xFB1F, 0xFE74, 0xFFDA, 0x10000, 0x103C3, 0x10837,
    0x1D454, 0x1D4BB, 0x1D53E, 0x1D6A8, 0x1D74E, 0x1D78A, 0x1D7CB, 0x2F800];
  foreach (u; unichars)
    assert(isUniAlpha(u), "expected char to be a unialpha");

  // Check some code points that fall into gaps of the table or beyond its end.
  dchar[] nonAlphas = [0x00D7, 0x00F7, 0x02C2, 0x2FA1E, 0x10FFFF];
  foreach (u; nonAlphas)
    assert(!isUniAlpha(u), "expected char not to be a unialpha");
}