1 /// Author: Aziz Köksal
2 /// License: GPL3
3 /// $(Maturity low)
4 module dil.StringSet;
5 
6 import dil.Array;
7 import dil.String;
8 import common;
9 
10 import std.c..string;
11 
12 /// Binary tree implementation of a hash set with strings as elements.
13 struct StringSet
14 {
15   struct Node
16   {
17     size_t left;  /// Index of left branch.
18     size_t right; /// Index of right branch.
19     hash_t hash;  /// Hash value.
20     cbinstr str;  /// Binary string.
21     cstring repr()
22     {
23       return Format("Node(left={},right={},hash={},str={})",
24         left, right, hash, str.length > 4 ? str[0..4] : str);
25     }
26     string toString()
27     {
28       return cast(string)repr();
29     }
30   }
31 
32   size_t[] list; /// List of 1-based indices to Nodes.
33   DArray!Node nodes; /// Instances of Node.
34 
35   /// Constructs a StringSet with the desired properties.
36   this(size_t nrOfNodes = 0, size_t size = 0)
37   {
38     nodes.cap = nrOfNodes;
39     resize(size);
40   }
41 
42   /// Allocates and returns the 1-based index of a new uninitialized Node.
43   size_t newNode()
44   {
45     if (nodes.cap == 0)
46       nodes.cap = 1;
47     else if (nodes.rem == 0)
48       nodes.growX1_5();
49     //*nodes.cur = Node.init;
50     nodes.cur++;
51     return nodes.len;
52   }
53 
54   /// Returns the Node for the 1-based index.
55   Node* getNode(size_t nindex)
56   {
57     assert(nindex);
58     return nodes.ptr + nindex - 1;
59   }
60 
61   /// Resizes the bucket list and reassigns the nodes.
62   void resize(size_t size)
63   {
64     list = new typeof(list[0])[size];
65     foreach (i, ref n; nodes[])
66     {
67       n.left = n.right = 0;
68       auto bindex = n.hash % list.length;
69       auto pindex = &list[bindex];
70       auto hash = n.hash;
71       auto str = n.str;
72       while (*pindex)
73       {
74         auto node = getNode(*pindex);
75         int_t diff;
76         if ((diff = node.hash - hash) == 0 &&
77             (diff = node.str.length - str.length) == 0 &&
78             (diff = memcmp(node.str.ptr, str.ptr, str.length)) == 0)
79           assert(0, "can't have two equal nodes in the same set");
80         // Go left or right down the tree.
81         pindex = diff < 0 ? &node.left : &node.right;
82       }
83       *pindex = i + 1;
84     }
85   }
86 
87   /// Finds the Node holding str, or a slot where a new Node can be saved.
88   size_t* find(cbinstr str, hash_t hash)
89   {
90     if (!list.length)
91       resize(32);
92     auto pindex = &list[hash % list.length];
93     while (*pindex)
94     {
95       auto node = getNode(*pindex);
96       int_t diff;
97       if ((diff = node.hash - hash) == 0 &&
98           (diff = node.str.length - str.length) == 0 &&
99           (diff = memcmp(node.str.ptr, str.ptr, str.length)) == 0)
100         break; // If equal in hash, length and content.
101       // Go left or right down the tree.
102       pindex = diff < 0 ? &node.left : &node.right;
103     }
104     return pindex;
105   }
106 
107   /// ditto
108   size_t* find(cbinstr str)
109   {
110     return find(str, hashOf(cast(cstring)str));
111   }
112 
113   /// Returns a reference to str if it exists.
114   const(cbinstr)* get(cbinstr str)
115   {
116     auto p = find(str);
117     return *p ? &getNode(*p).str : null;
118   }
119 
120   /// Returns true if str is contained in the set.
121   bool has(cbinstr str)
122   {
123     return list.length && !!*find(str);
124   }
125 
126   /// Returns an existing entry or creates a new one.
127   const(cbinstr)* add(cbinstr str)
128   {
129     auto hash = hashOf(cast(cstring)str);
130     auto p = find(str, hash);
131     Node* node;
132     if (*p)
133       node = getNode(*p);
134     else // Empty spot.
135     {
136       node = getNode(*p = newNode());
137       *node = Node(0, 0, hash, str);
138     }
139     return &node.str;
140   }
141 
142   /// Returns a string representation.
143   cstring repr()
144   {
145     return Format("StringSet(list={}, nodes={})", list, nodes[]);
146   }
147 }
148 
149 inout(ubyte)[] tobin(inout(char)[] str)
150 {
151   return cast(inout(ubyte)[]) str;
152 }
153 
154 void testStringSet()
155 {
156   scope msg = new UnittestMsg("Testing struct StringSet.");
157   StringSet s;
158   auto pstr = s.add("abcd".tobin);
159   assert(pstr is s.get("abcd".tobin));
160   assert(s.has("abcd".tobin));
161   s.resize(64);
162   assert(s.has("abcd".tobin));
163 }