1 // Written in the D programming language.
2 //
3 // Placed into the Public Domain.
4 // Digital Mars, www.digitalmars.com
5 // Written by Walter Bright
6 // Modified by Aziz Köksal
7 
8 /// Simple Unicode character classification functions.
9 /// References:
10 ///   $(LINK2 http://www.digitalmars.com/d/ascii-table.html, ASCII Table),
11 ///   $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia),
12 ///   $(LINK2 http://www.unicode.org, The Unicode Consortium)
13 /// Trademarks:
14 ///   Unicode™ is a trademark of Unicode, Inc.
15 /// Copyright:
16 ///   Public Domain.
17 module util.uni;
18 
/// Tests whether c is a Unicode lower case character.
///
/// Params:
///   c = the character to classify.
/// Returns: non-zero if c is lower case, zero otherwise.
int isUniLower(dchar c)
{
  // ASCII characters only need a range check.
  if (c < 0x80)
    return ('a' <= c && c <= 'z');
  // Beyond ASCII, c has to be a letter which toUniLower() leaves unchanged.
  return isUniAlpha(c) && toUniLower(c) == c;
}
26 
/// Tests whether c is a Unicode upper case character.
///
/// Params:
///   c = the character to classify.
/// Returns: non-zero if c is upper case, zero otherwise.
int isUniUpper(dchar c)
{
  // ASCII characters only need a range check.
  if (c < 0x80)
    return ('A' <= c && c <= 'Z');
  // Beyond ASCII, c has to be a letter which toUniUpper() leaves unchanged.
  return isUniAlpha(c) && toUniUpper(c) == c;
}
34 
/// Converts an upper case Unicode character to its lower case equivalent.
///
/// Only the simple one-to-one mappings of the ranges tested in the body are
/// known (parts of Latin, Cyrillic, Armenian, Georgian and fullwidth Latin);
/// this is not a complete implementation of the Unicode case mappings.
///
/// Params:
///   c = the character to convert.
/// Returns: the lower case equivalent of c if c is one of the upper case
///   characters handled here, otherwise c unchanged.
dchar toUniLower(dchar c)
{
  if (c >= 'A' && c <= 'Z')
    c += 32;
  else if (c >= 0x00C0)
  {
    if ((c >= 0x00C0 && c <= 0x00D6) || (c >= 0x00D8 && c <= 0x00DE))
      c += 32; // U+00D7 is skipped on purpose: it is left unchanged.
    else if ((c >= 0x0100 && c < 0x0138) || (c > 0x0149 && c < 0x0178))
    {
      // Pairs are laid out as (even = upper, odd = lower).
      if (c == 0x0130)
        c = 0x0069; // Special case: maps to ASCII 'i'.
      else if ((c & 1) == 0)
        c += 1;
    }
    else if (c == 0x0178)
      c = 0x00FF;
    else if ((c >= 0x0139 && c < 0x0149) || (c > 0x0178 && c < 0x017F))
    {
      // Pairs are laid out as (odd = upper, even = lower).
      if (c & 1)
        c += 1;
    }
    else if (c >= 0x0200 && c <= 0x0217)
    {
      if ((c & 1) == 0)
        c += 1;
    }
    else if (c >= 0x0400 && c <= 0x040F)
      c += 80; // Includes U+0400 and U+040D, which were previously skipped.
    else if (c >= 0x0410 && c <= 0x042F)
      c += 32;
    else if (c >= 0x0460 && c <= 0x047F)
    {
      if ((c & 1) == 0)
        c += 1;
    }
    else if (c >= 0x0531 && c <= 0x0556)
      c += 48;
    else if (c >= 0x10A0 && c <= 0x10C5)
      // Georgian capitals map to the small letters U+2D00..U+2D25,
      // not to the caseless letters at U+10D0..U+10F5.
      c += 0x1C60;
    else if (c >= 0xFF21 && c <= 0xFF3A)
      c += 32;
  }
  return c;
}
82 
/// Converts a lower case Unicode character to its upper case equivalent.
///
/// Only the simple one-to-one mappings of the ranges tested in the body are
/// known (parts of Latin, Cyrillic, Armenian, Georgian and fullwidth Latin);
/// this is not a complete implementation of the Unicode case mappings.
///
/// Params:
///   c = the character to convert.
/// Returns: the upper case equivalent of c if c is one of the lower case
///   characters handled here, otherwise c unchanged.
dchar toUniUpper(dchar c)
{
  if (c >= 'a' && c <= 'z')
    c -= 32;
  else if (c >= 0x00E0)
  {
    if ((c >= 0x00E0 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FE))
      c -= 32; // U+00F7 is skipped on purpose: it is left unchanged.
    else if (c == 0x00FF)
      c = 0x0178;
    else if ((c >= 0x0100 && c < 0x0138) || (c > 0x0149 && c < 0x0178))
    {
      // Pairs are laid out as (even = upper, odd = lower).
      if (c == 0x0131)
        c = 0x0049; // Special case: maps to ASCII 'I'.
      else if (c & 1)
        c -= 1;
    }
    else if ((c >= 0x0139 && c < 0x0149) || (c > 0x0178 && c < 0x017F))
    {
      // Pairs are laid out as (odd = upper, even = lower).
      if ((c & 1) == 0)
        c -= 1;
    }
    else if (c == 0x017F)
      c = 0x0053; // Special case: maps to ASCII 'S'.
    else if (c >= 0x0200 && c <= 0x0217)
    {
      if (c & 1)
        c -= 1;
    }
    else if (c >= 0x0430 && c <= 0x044F)
      c -= 32;
    else if (c >= 0x0450 && c <= 0x045F)
      c -= 80; // Includes U+0450 and U+045D, which were previously skipped.
    else if (c >= 0x0460 && c <= 0x047F)
    {
      if (c & 1)
        c -= 1;
    }
    else if (c >= 0x0561 && c < 0x0587)
      c -= 48; // U+0587 is excluded by the strict upper bound.
    else if (c >= 0x2D00 && c <= 0x2D25)
      // Georgian small letters map back to the capitals U+10A0..U+10C5.
      c -= 0x1C60;
    else if (c >= 0xFF41 && c <= 0xFF5A)
      c -= 32;
  }
  return c;
}
130 
/// The code point ranges that make up the set of UniAlpha characters.
///
/// Every entry is an inclusive [first, last] pair. The entries are sorted in
/// ascending order and do not overlap, which is what allows the table to be
/// binary searched.
static const dchar[2][] uniAlphaTable = [
  ['A', 'Z'], ['a', 'z'], [0x00AA, 0x00AA], [0x00B5, 0x00B5],
  [0x00BA, 0x00BA], [0x00C0, 0x00D6], [0x00D8, 0x00F6], [0x00F8, 0x02C1],
  [0x02C6, 0x02D1], [0x02E0, 0x02E4], [0x02EE, 0x02EE], [0x037A, 0x037D],
  [0x0386, 0x0386], [0x0388, 0x038A], [0x038C, 0x038C], [0x038E, 0x03A1],
  [0x03A3, 0x03CE], [0x03D0, 0x03F5], [0x03F7, 0x0481], [0x048A, 0x0513],
  [0x0531, 0x0556], [0x0559, 0x0559], [0x0561, 0x0587], [0x05D0, 0x05EA],
  [0x05F0, 0x05F2], [0x0621, 0x063A], [0x0640, 0x064A], [0x066E, 0x066F],
  [0x0671, 0x06D3], [0x06D5, 0x06D5], [0x06E5, 0x06E6], [0x06EE, 0x06EF],
  [0x06FA, 0x06FC], [0x06FF, 0x06FF], [0x0710, 0x0710], [0x0712, 0x072F],
  [0x074D, 0x076D], [0x0780, 0x07A5], [0x07B1, 0x07B1], [0x07CA, 0x07EA],
  [0x07F4, 0x07F5], [0x07FA, 0x07FA], [0x0904, 0x0939], [0x093D, 0x093D],
  [0x0950, 0x0950], [0x0958, 0x0961], [0x097B, 0x097F], [0x0985, 0x098C],
  [0x098F, 0x0990], [0x0993, 0x09A8], [0x09AA, 0x09B0], [0x09B2, 0x09B2],
  [0x09B6, 0x09B9], [0x09BD, 0x09BD], [0x09CE, 0x09CE], [0x09DC, 0x09DD],
  [0x09DF, 0x09E1], [0x09F0, 0x09F1], [0x0A05, 0x0A0A], [0x0A0F, 0x0A10],
  [0x0A13, 0x0A28], [0x0A2A, 0x0A30], [0x0A32, 0x0A33], [0x0A35, 0x0A36],
  [0x0A38, 0x0A39], [0x0A59, 0x0A5C], [0x0A5E, 0x0A5E], [0x0A72, 0x0A74],
  [0x0A85, 0x0A8D], [0x0A8F, 0x0A91], [0x0A93, 0x0AA8], [0x0AAA, 0x0AB0],
  [0x0AB2, 0x0AB3], [0x0AB5, 0x0AB9], [0x0ABD, 0x0ABD], [0x0AD0, 0x0AD0],
  [0x0AE0, 0x0AE1], [0x0B05, 0x0B0C], [0x0B0F, 0x0B10], [0x0B13, 0x0B28],
  [0x0B2A, 0x0B30], [0x0B32, 0x0B33], [0x0B35, 0x0B39], [0x0B3D, 0x0B3D],
  [0x0B5C, 0x0B5D], [0x0B5F, 0x0B61], [0x0B71, 0x0B71], [0x0B83, 0x0B83],
  [0x0B85, 0x0B8A], [0x0B8E, 0x0B90], [0x0B92, 0x0B95], [0x0B99, 0x0B9A],
  [0x0B9C, 0x0B9C], [0x0B9E, 0x0B9F], [0x0BA3, 0x0BA4], [0x0BA8, 0x0BAA],
  [0x0BAE, 0x0BB9], [0x0C05, 0x0C0C], [0x0C0E, 0x0C10], [0x0C12, 0x0C28],
  [0x0C2A, 0x0C33], [0x0C35, 0x0C39], [0x0C60, 0x0C61], [0x0C85, 0x0C8C],
  [0x0C8E, 0x0C90], [0x0C92, 0x0CA8], [0x0CAA, 0x0CB3], [0x0CB5, 0x0CB9],
  [0x0CBD, 0x0CBD], [0x0CDE, 0x0CDE], [0x0CE0, 0x0CE1], [0x0D05, 0x0D0C],
  [0x0D0E, 0x0D10], [0x0D12, 0x0D28], [0x0D2A, 0x0D39], [0x0D60, 0x0D61],
  [0x0D85, 0x0D96], [0x0D9A, 0x0DB1], [0x0DB3, 0x0DBB], [0x0DBD, 0x0DBD],
  [0x0DC0, 0x0DC6], [0x0E01, 0x0E30], [0x0E32, 0x0E33], [0x0E40, 0x0E46],
  [0x0E81, 0x0E82], [0x0E84, 0x0E84], [0x0E87, 0x0E88], [0x0E8A, 0x0E8A],
  [0x0E8D, 0x0E8D], [0x0E94, 0x0E97], [0x0E99, 0x0E9F], [0x0EA1, 0x0EA3],
  [0x0EA5, 0x0EA5], [0x0EA7, 0x0EA7], [0x0EAA, 0x0EAB], [0x0EAD, 0x0EB0],
  [0x0EB2, 0x0EB3], [0x0EBD, 0x0EBD], [0x0EC0, 0x0EC4], [0x0EC6, 0x0EC6],
  [0x0EDC, 0x0EDD], [0x0F00, 0x0F00], [0x0F40, 0x0F47], [0x0F49, 0x0F6A],
  [0x0F88, 0x0F8B], [0x1000, 0x1021], [0x1023, 0x1027], [0x1029, 0x102A],
  [0x1050, 0x1055], [0x10A0, 0x10C5], [0x10D0, 0x10FA], [0x10FC, 0x10FC],
  [0x1100, 0x1159], [0x115F, 0x11A2], [0x11A8, 0x11F9], [0x1200, 0x1248],
  [0x124A, 0x124D], [0x1250, 0x1256], [0x1258, 0x1258], [0x125A, 0x125D],
  [0x1260, 0x1288], [0x128A, 0x128D], [0x1290, 0x12B0], [0x12B2, 0x12B5],
  [0x12B8, 0x12BE], [0x12C0, 0x12C0], [0x12C2, 0x12C5], [0x12C8, 0x12D6],
  [0x12D8, 0x1310], [0x1312, 0x1315], [0x1318, 0x135A], [0x1380, 0x138F],
  [0x13A0, 0x13F4], [0x1401, 0x166C], [0x166F, 0x1676], [0x1681, 0x169A],
  [0x16A0, 0x16EA], [0x1700, 0x170C], [0x170E, 0x1711], [0x1720, 0x1731],
  [0x1740, 0x1751], [0x1760, 0x176C], [0x176E, 0x1770], [0x1780, 0x17B3],
  [0x17D7, 0x17D7], [0x17DC, 0x17DC], [0x1820, 0x1877], [0x1880, 0x18A8],
  [0x1900, 0x191C], [0x1950, 0x196D], [0x1970, 0x1974], [0x1980, 0x19A9],
  [0x19C1, 0x19C7], [0x1A00, 0x1A16], [0x1B05, 0x1B33], [0x1B45, 0x1B4B],
  [0x1D00, 0x1DBF], [0x1E00, 0x1E9B], [0x1EA0, 0x1EF9], [0x1F00, 0x1F15],
  [0x1F18, 0x1F1D], [0x1F20, 0x1F45], [0x1F48, 0x1F4D], [0x1F50, 0x1F57],
  [0x1F59, 0x1F59], [0x1F5B, 0x1F5B], [0x1F5D, 0x1F5D], [0x1F5F, 0x1F7D],
  [0x1F80, 0x1FB4], [0x1FB6, 0x1FBC], [0x1FBE, 0x1FBE], [0x1FC2, 0x1FC4],
  [0x1FC6, 0x1FCC], [0x1FD0, 0x1FD3], [0x1FD6, 0x1FDB], [0x1FE0, 0x1FEC],
  [0x1FF2, 0x1FF4], [0x1FF6, 0x1FFC], [0x2071, 0x2071], [0x207F, 0x207F],
  [0x2090, 0x2094], [0x2102, 0x2102], [0x2107, 0x2107], [0x210A, 0x2113],
  [0x2115, 0x2115], [0x2119, 0x211D], [0x2124, 0x2124], [0x2126, 0x2126],
  [0x2128, 0x2128], [0x212A, 0x212D], [0x212F, 0x2139], [0x213C, 0x213F],
  [0x2145, 0x2149], [0x214E, 0x214E], [0x2183, 0x2184], [0x2C00, 0x2C2E],
  [0x2C30, 0x2C5E], [0x2C60, 0x2C6C], [0x2C74, 0x2C77], [0x2C80, 0x2CE4],
  [0x2D00, 0x2D25], [0x2D30, 0x2D65], [0x2D6F, 0x2D6F], [0x2D80, 0x2D96],
  [0x2DA0, 0x2DA6], [0x2DA8, 0x2DAE], [0x2DB0, 0x2DB6], [0x2DB8, 0x2DBE],
  [0x2DC0, 0x2DC6], [0x2DC8, 0x2DCE], [0x2DD0, 0x2DD6], [0x2DD8, 0x2DDE],
  [0x3005, 0x3006], [0x3031, 0x3035], [0x303B, 0x303C], [0x3041, 0x3096],
  [0x309D, 0x309F], [0x30A1, 0x30FA], [0x30FC, 0x30FF], [0x3105, 0x312C],
  [0x3131, 0x318E], [0x31A0, 0x31B7], [0x31F0, 0x31FF], [0x3400, 0x4DB5],
  [0x4E00, 0x9FBB], [0xA000, 0xA48C], [0xA717, 0xA71A], [0xA800, 0xA801],
  [0xA803, 0xA805], [0xA807, 0xA80A], [0xA80C, 0xA822], [0xA840, 0xA873],
  [0xAC00, 0xD7A3], [0xF900, 0xFA2D], [0xFA30, 0xFA6A], [0xFA70, 0xFAD9],
  [0xFB00, 0xFB06], [0xFB13, 0xFB17], [0xFB1D, 0xFB1D], [0xFB1F, 0xFB28],
  [0xFB2A, 0xFB36], [0xFB38, 0xFB3C], [0xFB3E, 0xFB3E], [0xFB40, 0xFB41],
  [0xFB43, 0xFB44], [0xFB46, 0xFBB1], [0xFBD3, 0xFD3D], [0xFD50, 0xFD8F],
  [0xFD92, 0xFDC7], [0xFDF0, 0xFDFB], [0xFE70, 0xFE74], [0xFE76, 0xFEFC],
  [0xFF21, 0xFF3A], [0xFF41, 0xFF5A], [0xFF66, 0xFFBE], [0xFFC2, 0xFFC7],
  [0xFFCA, 0xFFCF], [0xFFD2, 0xFFD7], [0xFFDA, 0xFFDC], [0x10000, 0x1000B],
  [0x1000D, 0x10026], [0x10028, 0x1003A], [0x1003C, 0x1003D], [0x1003F, 0x1004D],
  [0x10050, 0x1005D], [0x10080, 0x100FA], [0x10300, 0x1031E], [0x10330, 0x10340],
  [0x10342, 0x10349], [0x10380, 0x1039D], [0x103A0, 0x103C3], [0x103C8, 0x103CF],
  [0x10400, 0x1049D], [0x10800, 0x10805], [0x10808, 0x10808], [0x1080A, 0x10835],
  [0x10837, 0x10838], [0x1083C, 0x1083C], [0x1083F, 0x1083F], [0x10900, 0x10915],
  [0x10A00, 0x10A00], [0x10A10, 0x10A13], [0x10A15, 0x10A17], [0x10A19, 0x10A33],
  [0x12000, 0x1236E], [0x1D400, 0x1D454], [0x1D456, 0x1D49C], [0x1D49E, 0x1D49F],
  [0x1D4A2, 0x1D4A2], [0x1D4A5, 0x1D4A6], [0x1D4A9, 0x1D4AC], [0x1D4AE, 0x1D4B9],
  [0x1D4BB, 0x1D4BB], [0x1D4BD, 0x1D4C3], [0x1D4C5, 0x1D505], [0x1D507, 0x1D50A],
  [0x1D50D, 0x1D514], [0x1D516, 0x1D51C], [0x1D51E, 0x1D539], [0x1D53B, 0x1D53E],
  [0x1D540, 0x1D544], [0x1D546, 0x1D546], [0x1D54A, 0x1D550], [0x1D552, 0x1D6A5],
  [0x1D6A8, 0x1D6C0], [0x1D6C2, 0x1D6DA], [0x1D6DC, 0x1D6FA], [0x1D6FC, 0x1D714],
  [0x1D716, 0x1D734], [0x1D736, 0x1D74E], [0x1D750, 0x1D76E], [0x1D770, 0x1D788],
  [0x1D78A, 0x1D7A8], [0x1D7AA, 0x1D7C2], [0x1D7C4, 0x1D7CB], [0x20000, 0x2A6D6],
  [0x2F800, 0x2FA1D],
];
491 
492 
/// Tests whether u is a Unicode alpha character, i.e. whether it lies in one
/// of the ranges of uniAlphaTable.
/// (General Unicode category: Lu, Ll, Lt, Lm and Lo)
///
/// Params:
///   u = the character to classify.
/// Returns: 1 if u is an alpha character, 0 otherwise.
///
/// Standards: Unicode 5.0.0
int isUniAlpha(dchar u)
out(found)
{
  debug
  {
    // Debug builds cross-check the result with a linear scan of the table.
    bool scanTable()
    {
      foreach (r; uniAlphaTable)
        if (u >= r[0] && u <= r[1])
          return true;
      return false;
    }
    assert(!!scanTable() == !!found);
  }
}
body
{
  alias ranges = uniAlphaTable;
  static assert(ranges.length < size_t.max / 2, "'mid' may overflow!");

  // Quick path for ASCII letters.
  if (u < 0xAA && (('A' <= u && u <= 'Z') || ('a' <= u && u <= 'z')))
    return 1;

  // Binary search; the candidates are the indices in [lo, hi).
  size_t lo = 0;
  size_t hi = ranges.length;
  while (lo < hi)
  {
    auto mid = (lo + hi) / 2;
    auto r = ranges[mid];
    if (u > r[1])
      lo = mid + 1; // u lies above this range.
    else if (u < r[0])
      hi = mid; // u lies below this range.
    else
      return 1; // u lies inside this range.
  }
  assert(lo == hi);
  return 0;
}
536 
/// Checks the integrity of uniAlphaTable and tests isUniAlpha() with the
/// ASCII range and a handful of other Unicode characters.
void testIsUniAlpha()
{
  import common;
  scope msg = new UnittestMsg("Testing function isUniAlpha().");

  // Every range must be well-formed and lie strictly below its successor.
  foreach (idx, r; uniAlphaTable)
  {
    assert(r[0] <= r[1]);
    if (idx + 1 < uniAlphaTable.length)
      assert(r[1] < uniAlphaTable[idx + 1][0]);
  }

  // In the ASCII range exactly the letters A-Z and a-z are alpha characters.
  for (uint ch; ch < 0x80; ch++)
  {
    auto isUA = isUniAlpha(ch);
    auto isLetter = ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
    assert(isLetter ? isUA : !isUA);
  }

  // A selection of characters taken from the table's range boundaries.
  dchar[] samples = [
    0x0C8C, 0x0E81, 0x0EBD, 0x0F88, 0x10FC,
    0x17B3, 0xFB1F, 0xFE74, 0xFFDA, 0x10000,
    0x103C3, 0x10837, 0x1D454, 0x1D4BB, 0x1D53E,
    0x1D6A8, 0x1D74E, 0x1D78A, 0x1D7CB, 0x2F800,
  ];
  foreach (sample; samples)
    assert(isUniAlpha(sample), "expected char to be a unialpha");
}