1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity very high)
4 module dil.FileBOM;
5 
6 import common;
7 
/// The byte order marks that can be recognized at the start of a file.
enum BOM
{
  /// No BOM
  None,
  /// UTF-8: EF BB BF
  UTF8,
  /// UTF-16 Big Endian: FE FF
  UTF16BE,
  /// UTF-16 Little Endian: FF FE
  UTF16LE,
  /// UTF-32 Big Endian: 00 00 FE FF
  UTF32BE,
  /// UTF-32 Little Endian: FF FE 00 00
  UTF32LE
}
18 
/// Inspects the leading bytes of data and reports which BOM they form.
///
/// Params:
///   data = The bytes to examine; only the first four are ever read.
/// Returns: The matching BOM member, or BOM.None when the leading bytes
///   do not form a complete byte order mark.
BOM tellBOM(const(ubyte)[] data)
{
  // Every byte order mark is at least two bytes long.
  if (data.length < 2)
    return BOM.None;

  const ubyte first = data[0];
  const ubyte second = data[1];
  const bool hasFour = data.length >= 4;

  if (first == 0xFE && second == 0xFF)
    return BOM.UTF16BE; // FE FF

  if (first == 0xFF && second == 0xFE)
  {
    // FF FE is a prefix of the UTF-32 LE mark, so look two bytes further
    // before settling on UTF-16 LE.
    if (hasFour && data[2] == 0x00 && data[3] == 0x00)
      return BOM.UTF32LE; // FF FE 00 00
    return BOM.UTF16LE; // FF FE XX XX
  }

  if (first == 0x00 && second == 0x00)
  {
    // Two zero bytes only count when followed by FE FF.
    if (hasFour && data[2] == 0xFE && data[3] == 0xFF)
      return BOM.UTF32BE; // 00 00 FE FF
    return BOM.None;
  }

  if (first == 0xEF && second == 0xBB)
  {
    // The UTF-8 mark needs its third byte to be complete.
    if (data.length >= 3 && data[2] == 0xBF)
      return BOM.UTF8; // EF BB BF
    return BOM.None;
  }

  return BOM.None;
}
49 
/// Runs tellBOM() over a table of byte sequences and asserts that each one
/// yields the expected BOM member.
void testTellBOM()
{
  scope banner = new UnittestMsg("Testing function tellBOM().");

  // One table row: the input bytes and the BOM they must be recognized as.
  struct Sample
  {
    ubyte[] input;
    BOM expected;
  }
  alias Bytes = ubyte[];
  const Sample[] samples = [
    // Sequences that do not start with any byte order mark.
    {cast(Bytes)x"12",          BOM.None},
    {cast(Bytes)x"12 34",       BOM.None},
    {cast(Bytes)x"00 00 FF FE", BOM.None},
    {cast(Bytes)x"EF BB FF",    BOM.None},

    // Truncated marks must not be reported as a BOM.
    {cast(Bytes)x"EF",          BOM.None},
    {cast(Bytes)x"EF BB",       BOM.None},
    {cast(Bytes)x"FE",          BOM.None},
    {cast(Bytes)x"FF",          BOM.None},
    {cast(Bytes)x"00",          BOM.None},
    {cast(Bytes)x"00 00",       BOM.None},
    {cast(Bytes)x"00 00 FE",    BOM.None},

    // Bytes following a UTF-16 BE mark do not change the result.
    {cast(Bytes)x"FE FF 00",    BOM.UTF16BE},
    {cast(Bytes)x"FE FF 00 FF", BOM.UTF16BE},

    // The exact marks themselves.
    {cast(Bytes)x"EF BB BF",    BOM.UTF8},
    {cast(Bytes)x"FE FF",       BOM.UTF16BE},
    {cast(Bytes)x"FF FE",       BOM.UTF16LE},
    {cast(Bytes)x"00 00 FE FF", BOM.UTF32BE},
    {cast(Bytes)x"FF FE 00 00", BOM.UTF32LE}
  ];

  foreach (sample; samples)
  {
    const detected = tellBOM(sample.input);
    assert(detected == sample.expected, Format("Failed at {0}", sample.input));
  }
}