Skip to content

Commit 0a03ecb

Browse files
ClémentClément
authored andcommitted
Working on priority queue and improving Dictionary notes
1 parent b699401 commit 0a03ecb

7 files changed

Lines changed: 357 additions & 16 deletions

File tree

source/code/projects/Dictionary/Dictionary/Dictionary.cs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,10 @@ public int GetIndex(TKey keyP, int countP)
105105
Math.Abs(keyP.GetHashCode())
106106
+ CollisionResolution(keyP, countP)
107107
) % table.Length;
108+
108109
}
110+
// GetIndex is public for demonstration purposes,
111+
// but should really be private.
109112

110113
// This is how collisions are handled.
111114
// It depends on the strategy picked (Strategy),
@@ -123,16 +126,27 @@ private int CollisionResolution(TKey keyP, int countP)
123126
return countP * countP;
124127
else if (Strategy == PSSType.Double)
125128
// This is double hashing.
126-
return countP * (31 - (keyP.GetHashCode() % 31));
127-
// countP * hash2(keyP) where hash2 is 31 - (key % 31) and will always be > 0
129+
return countP * GetHash2(keyP);
128130
else
131+
// This is needed to compile:
132+
// even if we know that those are
133+
// the only values in the PSSType
134+
// enumerated datatype, C# will
135+
// complain that not all code paths
136+
// return a value otherwise.
129137
throw new ApplicationException(
130138
"Unknown collision startegy."
131139
);
132140
}
133141
}
134142
// Done with GetIndex and CollisionResolution.
135143

144+
// Secondary hash function
145+
/// <summary>
/// Computes the step used between two probes by the double hashing strategy.
/// </summary>
/// <param name="key">Key whose hash code determines the step.</param>
/// <returns>
/// A value between 1 and table.Length - 1 (or 1 if the table has at most
/// one slot), so that the step is never a multiple of table.Length.
/// </returns>
private int GetHash2(TKey key)
{
    // Reducing modulo (table.Length - 1) and adding 1 keeps the step
    // strictly below table.Length: a step equal to table.Length would
    // be the same as a step of 0 once the index is reduced modulo
    // table.Length, and we would probe the same slot again and again.
    int modulus = table.Length - 1;
    if (modulus < 1)
    {
        // A table with zero or one slot: any step is equivalent,
        // and we must avoid dividing by zero below.
        return 1;
    }
    // In C#, the sign of % follows the left operand, so the remainder
    // is negative when the hash code is negative: shift it back into
    // the range 0 .. modulus - 1.
    int remainder = key.GetHashCode() % modulus;
    if (remainder < 0)
    {
        remainder += modulus;
    }
    return 1 + remainder;
}
149+
136150
// Adding an element
137151
public void Add(TKey keyP, TValue valueP)
138152
{

source/code/projects/Dictionary/Dictionary/Program.cs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ static void Main(string[] args)
2525
CDictionary<string, Address> notebook = new CDictionary<
2626
string,
2727
Address
28-
>(13, CDictionary<string, Address>.PSSType.Linear);
28+
>(13, CDictionary<string, Address>.PSSType.Quad);
2929
// Key of type string, value of type Address, Quadratic probe sequence strategy.
3030
// Try with
3131
// CDictionary<string, Address>.PSSType.Linear
@@ -90,5 +90,49 @@ static void Main(string[] args)
9090
{
9191
Console.Write((i * i) % 13 + ", ");
9292
}
93+
Console.WriteLine("…");
94+
95+
Console.WriteLine("\nFinally, let us observe the index computed using the quadratic strategy:");
96+
97+
CDictionary<string, string> demoQ = new CDictionary<
98+
string,
99+
string
100+
>(1009, CDictionary<string, string>.PSSType.Quad);
101+
bool[] arrayQ = new bool[1009];
102+
for (int i = 0; i < arrayQ.Length; i++)
103+
{
104+
arrayQ[demoQ.GetIndex("Test", i)] = true;
105+
// Uncomment the following if you'd like to see
106+
// which indices are hit.
107+
// Console.WriteLine(i +": " + demoQ.GetIndex("Test", i) + ".");
108+
}
109+
int count = 0;
110+
for (int i = 0; i < arrayQ.Length; i++)
111+
{
112+
if (arrayQ[i]) count++;
113+
}
114+
Console.WriteLine($"We hit {((decimal)count / arrayQ.Length):p} of the indices.");
115+
116+
Console.WriteLine("\nFinally, let us observe the index computed using the double hash strategy:");
117+
// Demonstrating the double hash strategy:
118+
CDictionary<string, string> demoD = new CDictionary<
119+
string,
120+
string
121+
>(1009, CDictionary<string, string>.PSSType.Double);
122+
bool[] arrayD = new bool[1009];
123+
for (int i = 0; i < arrayD.Length; i++)
124+
{
125+
arrayD[demoD.GetIndex("Test", i)] = true;
126+
// Uncomment the following if you'd like to see
127+
// which indices are hit.
128+
// Console.WriteLine(i +": " + demoD.GetIndex("Test", i) + ".");
129+
}
130+
count = 0;
131+
for (int i = 0; i < arrayD.Length; i++)
132+
{
133+
if (arrayD[i]) count++;
134+
}
135+
Console.WriteLine($"We hit {((decimal)count / arrayD.Length):p} of the indices.");
136+
// 100% !
93137
}
94138
}
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
using System; // This is required for the exception.
2+
3+
/// <summary>
/// A fixed-capacity priority queue stored in an unsorted array.
/// The element whose priority compares lowest is the most urgent one.
/// </summary>
/// <typeparam name="TPriority">Type of the priorities, comparable with itself.</typeparam>
/// <typeparam name="TValue">Type of the values held in the queue.</typeparam>
class PQueue<TPriority, TValue> where TPriority : IComparable<TPriority>
{
    // A value together with its priority.
    class Cell
    {
        public TPriority Priority { get; set; }
        public TValue Value { get; set; }

        public Cell(TPriority priorityP, TValue valueP)
        {
            Priority = priorityP;
            Value = valueP;
        }

        public override string ToString()
        {
            return Value + " (priority: " + Priority + ")";
        }
    }

    // Backing storage: a null entry is a free slot.
    private readonly Cell[] mArray;

    /// <summary>Creates an empty queue with room for sizeP elements.</summary>
    /// <param name="sizeP">Number of slots in the queue.</param>
    public PQueue(int sizeP = 10)
    {
        mArray = new Cell[sizeP];
    }

    /// <summary>Stores a value with its priority in the first free slot.</summary>
    /// <param name="priorityP">Priority of the element; cannot be null.</param>
    /// <param name="valueP">Value to store.</param>
    /// <exception cref="ArgumentNullException">If priorityP is null.</exception>
    /// <exception cref="ApplicationException">If there is no free slot left.</exception>
    public void Add(TPriority priorityP, TValue valueP)
    {
        // A null priority cannot be compared: accepting it would
        // make every later call to MinPriority fail with a
        // NullReferenceException, so we reject it right away.
        if (priorityP == null)
        {
            throw new ArgumentNullException(nameof(priorityP));
        }
        // We look for the first free slot in the array.
        for (int index = 0; index < mArray.Length; index++)
        {
            if (mArray[index] == null)
            {
                mArray[index] = new Cell(priorityP, valueP);
                return;
            }
        }
        // Every slot is taken.
        throw new ApplicationException("Could not add the element.");
    }

    /// <summary>Finds where the most urgent element is stored.</summary>
    /// <returns>
    /// The index of the cell with the lowest priority; if several cells
    /// share that priority, the one with the smallest index.
    /// </returns>
    /// <exception cref="ApplicationException">If the queue is empty.</exception>
    public int MinPriority()
    {
        // Index of the minimal priority found "so far",
        // -1 as long as we only met empty slots.
        int minI = -1;
        for (int index = 0; index < mArray.Length; index++)
        {
            Cell candidate = mArray[index];
            // There may be null values in the array, and we should
            // not try to access the Priority property of those.
            if (candidate == null)
            {
                continue;
            }
            // The comparison is strict, so that in case of a tie
            // the first cell found is kept.
            if (minI == -1 || candidate.Priority.CompareTo(mArray[minI].Priority) < 0)
            {
                minI = index;
            }
        }
        if (minI == -1)
        {
            throw new ApplicationException("Queue is empty, no index with minimal priority.");
        }
        return minI;
    }

    /// <summary>Describes the most urgent element without removing it.</summary>
    /// <exception cref="ApplicationException">If the queue is empty.</exception>
    public string Peek()
    {
        return mArray[MinPriority()].ToString();
    }

    /// <summary>Removes the most urgent element and returns its description.</summary>
    /// <exception cref="ApplicationException">If the queue is empty.</exception>
    public string Extract()
    {
        int minE = MinPriority();
        Cell cellE = mArray[minE];
        // Setting the slot to null makes it available again for Add.
        mArray[minE] = null;
        return cellE.ToString();
    }

    /// <summary>
    /// Lists every slot of the array, one per line, writing "(empty)"
    /// for the free ones.
    /// </summary>
    public override string ToString()
    {
        // A StringBuilder avoids copying the whole text
        // each time a line is added.
        System.Text.StringBuilder ret = new System.Text.StringBuilder();
        for (int i = 0; i < mArray.Length; i++)
        {
            if (mArray[i] != null)
            {
                ret.Append(mArray[i].ToString());
            }
            else
            {
                ret.Append("(empty)");
            }
            ret.Append("\n");
        }
        return ret.ToString();
    }
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
using System;
2+
3+
// Demonstrates PQueue using an emergency-room triage scenario.
class Program
{
    static void Main(string[] args)
    {
        PQueue<int, string> triage = new PQueue<int, string>(5);

        // Nothing was added yet: asking for the minimal
        // priority fails, and we display why.
        ReportFailure(() => triage.MinPriority());

        /*
         * https://en.wikipedia.org/wiki/Emergency_Severity_Index
         * Level 1: Resuscitation
         * Level 2: Emergent
         * Level 3: Urgent
         * Level 4: Semi-Urgent
         * Level 5: Non-urgent
         */
        triage.Add(1, "Cardiac arrest");
        Console.WriteLine(triage);
        int mostUrgentIndex = triage.MinPriority();
        Console.Write($"Most urgent priority is at index {mostUrgentIndex} with {triage.Peek()}.\n");

        // Four more patients: the five slots are now all taken.
        triage.Add(3, "High fever with cough");
        triage.Add(2, "Asthma attack");
        triage.Add(5, "Prescription refill ");
        triage.Add(4, "Simple laceration");
        Console.WriteLine(triage);

        // A sixth patient cannot be added.
        ReportFailure(() => triage.Add(3, "Abdominal pain"));

        ExtractMostUrgent(triage);
        ExtractMostUrgent(triage);

        Console.WriteLine(triage);
        triage.Add(5, "Suture removal");
        ExtractMostUrgent(triage);
        ExtractMostUrgent(triage);
        Console.WriteLine(triage);

        ExtractMostUrgent(triage);
        ExtractMostUrgent(triage);
        // The queue is empty at this point, this last
        // extraction fails.
        ReportFailure(() => ExtractMostUrgent(triage));
    }

    // Runs the action, and displays the message of
    // the exception if one is thrown.
    private static void ReportFailure(Action action)
    {
        try
        {
            action();
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message);
        }
    }

    // Removes the most urgent element and displays it.
    private static void ExtractMostUrgent(PQueue<int, string> queue)
    {
        Console.Write($"Removing most urgent: {queue.Extract()}.\n");
    }
}

source/lectures/data/dictionary.md

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ We illustrate this [point below](#handling-deletion).
142142
!include`snippetStart="// We use a bool Find sub-routine", snippetEnd="// Done with found."` code/projects/Dictionary/Dictionary/Dictionary.cs
143143
```
144144

145-
#### Handling Deletion
145+
#### Handling deletion
146146

147147
The `Remove` method heavily relies on `FindI`:
148148

@@ -186,25 +186,35 @@ Table size | Probe sequence
186186

187187
While still not ideal, we can see that using a prime number for the size allows us to "break the cyclicity" every now and then and to obtain additional numbers in the sequence: we go from 4 different indices to 7 per 10 indices.
188188

189-
The default size of 31 is picked for [various reasons](https://stackoverflow.com/questions/299304/why-does-javas-hashcode-in-string-use-31-as-a-multiplier), some being historical,
189+
The default size of 31 is picked for [various reasons](https://stackoverflow.com/questions/299304/why-does-javas-hashcode-in-string-use-31-as-a-multiplier), some being historical.
190190

191+
As we can see, the quadratic probing strategy has an issue that linear probing does not have: it may "skip" some indices, and incorrectly report that the array is full when it is not.
192+
Why would we choose it, then? Because of clustering.
191193

192194
#### Clustering
193195

194-
In general, the main goal is to avoid having parts of the array filled while other parts are left unused, a situation known as *clustering*.
196+
An important goal of dictionaries is to avoid having parts of the array filled while other parts are left unused, a situation known as *clustering*.
197+
This is detrimental, because finding an index requires more and more computation if keys are often given the same or close indices (i.e., we need to call `GetIndex` with higher `countP` values).
198+
195199
This situation *will* happen if too many keys are given the same hash and index, something that is hard to predict since keys will in general not be uniformly distributed and not known ahead of time.
196-
Linear probing is very bad in solving this problem, since the clusters are "spread out continuously", quadratic probing is an improvement, but only partially solve this issue, since keys with identical hashes will still follow teh same sequence.
197-
Double hashing is a bit better at solving this problem, since keys with identical hashes may drift apart significantly when the secondary hash function is applied.
200+
Linear probing is very bad at solving this problem, since the clusters are "spread out continuously". Quadratic probing is an improvement, but only partially solves this issue, since keys with identical hashes will still follow the same sequence.
201+
[Double hashing](https://en.wikipedia.org/wiki/Double_hashing) is a bit better at solving this problem, since keys with identical hashes may drift apart significantly when the secondary hash function is applied:
198202

199-
This general discussion relates to performance and requires to measure the dictionary's load factor, which is the number of entries occupied in the hash table divided by the table length (or number of "buckets").
200-
Of course, open-addressed hash table cannot have a load factor greater than 1, but other techniques, such as chaining, allows for larger load factors.
201203

202-
<!--
203-
Double hashing, in which the interval between probes is computed by a secondary hash function
204+
```{download="./code/projects/Dictionary.zip"}
205+
!include`snippetStart="// Secondary hash function", snippetEnd="// Adding an element"` code/projects/Dictionary/Dictionary/Dictionary.cs
206+
```
204207

205-
For open addressing schemes, the hash function should also avoid clustering, the mapping of two or more keys to consecutive slots. Such clustering may cause the lookup cost to skyrocket, even if the load factor is low and collisions are infrequent. The popular multiplicative hash is claimed to have particularly poor clustering behavior.[22][4]
208+
A secondary hash function **must never evaluate to zero** (otherwise we are just trying the same spot again and again), should be as independent from the first hash function as possible, and should help in trying as many slots as possible.
209+
Note that our function never evaluates to zero: in C#, the result of `%` has the sign of its left operand, so `key.GetHashCode() % table.Length` gives a value strictly between -`table.Length` and `table.Length`, and `table.Length - (key.GetHashCode() % table.Length)` gives a value between 1 and 2 × `table.Length` - 1.
206210

207-
https://www.javamex.com/tutorials/collections/hash_function_technical_2.shtml
211+
Our `Main` method includes a test demonstrating the efficiency of our double hashing technique:
208212

209-
cf. "Implementing Double Hashing" at <https://pressbooks.palni.org/anopenguidetodatastructuresandalgorithms/chapter/hashing-and-hash-tables/>
210-
-->
213+
```{download="./code/projects/Dictionary.zip"}
214+
!include`snippetStart="// Demonstrating the double hash strategy:", snippetEnd="// 100% !"` code/projects/Dictionary/Dictionary/Program.cs
215+
```
216+
217+
While the quadratic method hits about 50% of the indices, the double hashing technique reaches 100%!
218+
219+
This general discussion relates to performance and requires measuring the dictionary's load factor, which is the number of entries occupied in the hash table divided by the table length (or number of "buckets").
220+
Of course, an open-addressed hash table cannot have a load factor greater than 1, but other techniques, such as chaining, allow for larger load factors.

0 commit comments

Comments
 (0)