Skip to content

Commit 7d30537

Browse files
ClémentClément
authored andcommitted
Added details on dictionary.
1 parent db68fef commit 7d30537

3 files changed

Lines changed: 124 additions & 111 deletions

File tree

source/code/projects/Dictionary/Dictionary/Dictionary.cs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -99,18 +99,18 @@ public void Clear()
9999
// how many collisions we met so far.
100100
public int GetIndex(TKey keyP, int countP)
101101
{
102-
// countP captures the number of times we had to solve
103-
// a collision.
102+
// countP captures the number of times we had
103+
// to solve a collision.
104104
return (
105105
Math.Abs(keyP.GetHashCode())
106106
+ CollisionResolution(keyP, countP)
107107
) % table.Length;
108108
}
109109

110-
// This is the how collision are handled.
111-
// It depends on the strategy picked,
112-
// the key, and the number of time we had
113-
// to handle a collision.
110+
// This is how collisions are handled.
111+
// It depends on the strategy picked (Strategy),
112+
// the key (keyP), and the number of times we had
113+
// to handle a collision (countP).
114114
private int CollisionResolution(TKey keyP, int countP)
115115
{
116116
if (countP == 0)
@@ -153,12 +153,14 @@ public void Add(TKey keyP, TValue valueP)
153153
)
154154
{
155155
count++;
156-
if (count == table.Length) // If table is full, throw an exception.
156+
if (count == table.Length)
157157
{
158+
// If table is full, throw an exception.
158159
throw new ApplicationException("Table is full.");
159160
}
160-
else // there is still room, generate the next index.
161+
else
161162
{
163+
// There is still room, generate the next index.
162164
index = GetIndex(keyP, count);
163165
}
164166
}
@@ -178,6 +180,7 @@ public void Add(TKey keyP, TValue valueP)
178180
}
179181
// Done with adding an element
180182

183+
// We use a bool Find sub-routine
181184
public bool Find(TKey keyP)
182185
{
183186
bool found = FindI(keyP) != -1;
@@ -207,7 +210,9 @@ public int FindI(TKey keyP)
207210
if (!found) { index = -1; }
208211
return index;
209212
}
213+
// Done with found.
210214

215+
// Removing relies also on Find:
211216
public void Remove(TKey keyP)
212217
{
213218
int index = FindI(keyP);
@@ -220,4 +225,5 @@ public void Remove(TKey keyP)
220225
table[index].Status = StatusType.Deleted;
221226
}
222227
}
228+
// Done with Remove
223229
}
Lines changed: 32 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
using System;
22
using System.Collections.Generic;
33

4-
4+
// A simple, dummy class, to illustrate that
5+
// dictionaries can contain any type of
6+
// values, even complex ones.
57
public class Address
68
{
79
public string Street { get; set; }
@@ -24,13 +26,18 @@ static void Main(string[] args)
2426
string,
2527
Address
2628
>(13, CDictionary<string, Address>.PSSType.Linear);
27-
// Key of type string, value of type int.
29+
// Key of type string, value of type Address, Linear probe sequence strategy.
30+
// Try with
31+
// CDictionary<string, Address>.PSSType.Linear
32+
// CDictionary<string, Address>.PSSType.Quad
33+
// CDictionary<string, Address>.PSSType.Double
34+
35+
string[] friends = { "Bob", "Sarah", "Sam", "Justice", "Claire", "Pierre", "Mary", "Lora" };
2836

29-
string[] friends = { "Bob", "Sarah", "Justice", "Claire", "Pierre", "Mary", "Lora" };
3037
foreach (string friend in friends)
3138
{
3239
Console.WriteLine("Inserting " + friend + " (Original index: " + notebook.GetIndex(friend, 0) + ")");
33-
notebook.Add(friend, new Address("Main St."));
40+
notebook.Add(friend, new Address(notebook.GetIndex(friend, 0) + " Main St."));
3441
}
3542

3643
Console.WriteLine(notebook);
@@ -59,54 +66,29 @@ static void Main(string[] args)
5966
Console.WriteLine(ex.Message);
6067
}
6168

62-
/*
69+
Console.WriteLine("Clearing the notebook.");
70+
notebook.Clear();
71+
Console.WriteLine(notebook);
6372

64-
notebook.Add("twenty", 20);
65-
notebook.Add("fourteen", 14);
66-
notebook.Add("two", 2);
67-
notebook.Add("seventeen", 17);
68-
notebook["fifteen"] = 15;
69-
Console.Write(notebook);
70-
Console.WriteLine(notebook["two"]);
71-
notebook["two"] = 10;
72-
Console.WriteLine(notebook["two"]);
73+
string[] friends2 = {"Pierre", "Sandra", "Joy", "Nicole", "Sam", "Fritz"};
7374

74-
int x = notebook.Find("two");
75-
Console.WriteLine($"Found x = {x}");
76-
try
77-
{
78-
int y = notebook.Find("zzz");
79-
Console.WriteLine($"Found x = {y}");
80-
}
81-
catch (Exception)
82-
{
83-
Console.WriteLine($"Didn't find zzz");
84-
}
75+
foreach (string friend in friends2)
76+
{
77+
Console.WriteLine("Inserting " + friend + " (Original index: " + notebook.GetIndex(friend, 0) + ")");
78+
notebook.Add(friend, new Address(notebook.GetIndex(friend, 0) + " Main St."));
79+
}
80+
Console.WriteLine(notebook);
8581

86-
notebook.Remove("two");
87-
try
88-
{
89-
int y = notebook.Find("two");
90-
Console.WriteLine($"Should not find two = {y}");
91-
}
92-
catch (Exception)
93-
{
94-
Console.WriteLine(
95-
$"Didn't find two since it was removed"
96-
);
97-
}
98-
try
99-
{
100-
notebook.Remove("two");
101-
int y = notebook.Find("two");
102-
Console.WriteLine($"Should not find two = {y}");
103-
}
104-
catch (Exception)
105-
{
106-
Console.WriteLine(
107-
$"Shoud throw when trying to remove two since it was removed"
108-
);
109-
}
110-
*/
82+
Console.WriteLine("If the table size is not prime, we obtain:");
83+
for (int i = 0; i < 10; i++)
84+
{
85+
Console.Write((i * i) % 12 + ", ");
86+
}
87+
88+
Console.WriteLine("…\nIf the table size is prime, we obtain:");
89+
for (int i = 0; i < 10; i++)
90+
{
91+
Console.Write((i * i) % 13 + ", ");
92+
}
11193
}
11294
}

source/lectures/data/dictionary.md

Lines changed: 78 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ tags:
1010

1111
### Abstract Data Type
1212

13-
A *dictionary*, also called a *hash*, an *associative array*, a *map*, or a *hashmap*, is a key-value store: it stores values (that can be of any type) and indexes them using a key (which is in general of a simple type, such as `int`).
13+
A *dictionary*, also called a *hash*, an *associative array*, a *map*, or a *hashmap*, is a key-value store: it stores values (that can be of any type) and indexes them using a key (which is in general of a simple type, such as `int` or `string`).
1414

1515
Described [abstractly](./lectures/data/intro#abstract-data-types), [a dictionary](https://en.wikipedia.org/wiki/Hash_table) is
1616

@@ -22,18 +22,20 @@ Generally, it has operations to…
2222

2323
- … create an empty dictionary,
2424
- … test for emptiness,
25-
- … insert or update a value,
25+
- … insert or update a key-value pair,
2626
- … remove a key-value pair,
27-
- … test for existence of a key.
27+
- … test for the existence of a key.
2828

2929
And, very importantly, it uses
3030

31-
- *a hash function*, which transforms the key into an `int` (its *hash*), used as to produce an array index,
32-
- *a collision resolution strategy*, which handles when two *different* keys have been assigned the same index.
31+
- *a hash function*, which transforms a key into an `int` (its *hash*), used to produce an array index,
32+
- *a collision resolution strategy*, which handles when two *different* keys have the same hash.
33+
34+
Note that the collision resolution strategy is useful only when two *different* keys have the same hash: the same key is always assigned the same hash, but since a key cannot be part of two different key-value pairs, inserting an already-present key is not a collision to resolve, and we should throw an exception instead.
3335

3436
### Overview
3537

36-
A dictionary organises the key-value pairs into an array by storing it in its corresponding index, computed using the hash of the key.
38+
A dictionary organizes the key-value pairs into an array by storing each pair at its corresponding index, computed using the hash of the key.
3739
The main benefit of this approach is that looking if a key-value pair is already in the dictionary is immediate: it suffices to hash the key, and to look at the index obtained if the same key is already stored.
3840
The main downside is that *multiple (different) keys can be assigned the same hash, and hence the same index*: indeed, since the keys that will be used are not known ahead of time, it is possible that different keys are assigned the same index. This is a *collision*, and there are two main ways of resolving it:
3941

@@ -108,78 +110,101 @@ We obtain the following, where the details of `CollisionResolution` are not impo
108110

109111
#### Adding an element
110112

111-
Adding an element is a lengthy process.
112-
We simply require a key and a value, compute an index, store it into a variable `index`, and proceed as follows:
113-
114-
As long as the table contains a `Cell` at `index` whose status is not `Deleted` nor `Empty`,
113+
Adding an element is a delicate process.
114+
We only need a key and a value, and then we
115115

116-
- Increment the counter `count` that count the number of times we saw a conflict,
117-
- Check if `count` reached the size of the array:
118-
- if that is the case, throw an exception: we saw as many collisions as there are slots in the array, meaning that the array is full,
119-
- otherwise, generate another `index`, knowing that we met `count` conflicts already.
116+
- make sure the dictionary does not already contain a key-value pair with the same key ([detailed below](#find)),
117+
- compute an index, store it into a variable `index`, and proceed as follows:
120118

121-
Once we have reached this point (i.e., once we exit the `while` loop), we know that `index` refers to a place in the array that is either empty,
122-
119+
As long as the table contains a `Cell` at `index` whose status is neither `Deleted` nor `Empty`, we
123120

121+
- Increment the counter `count` that counts the number of times we had to "look for a new index" (i.e., solve a conflict),
122+
- Check if `count` reached the size of the array:
123+
- if that is the case, throw an exception: we saw as many collisions as there are slots in the array, meaning that the array is full,
124+
- otherwise, generate another `index`, knowing that we met `count` conflicts already.
124125

126+
- once we have reached this point (i.e., we found a suitable `index`), we know that `index` refers to a place in the array that is either
127+
128+
- empty, in which case we can create a `Cell` object using the parameters,
129+
- occupied by a `Cell` whose status is set to `Deleted` or `Empty`, in which case we can re-use it.
125130

126131
```{download="./code/projects/Dictionary.zip"}
127132
!include`snippetStart="// Adding an element", snippetEnd="// Done with adding an element"` code/projects/Dictionary/Dictionary/Dictionary.cs
128133
```
129134

135+
#### Finding a key {#find}
130136

131-
<!--
132-
https://en.wikibooks.org/wiki/Data_Structures/Hash_Tables#Open_addressing
137+
For `Find`, we use a subroutine `FindI` that computes the index of a key if it exists, and returns -1 otherwise.
138+
The critical point is to understand that we *need to keep looking even if the cell is marked as `Deleted`*.
139+
We illustrate this [point below](#deleted).
140+
141+
```{download="./code/projects/Dictionary.zip"}
142+
!include`snippetStart="// We use a bool Find sub-routine", snippetEnd="// Done with found."` code/projects/Dictionary/Dictionary/Dictionary.cs
143+
```
133144

134-
https://github.com/dotnet/runtime/issues/38340
145+
#### Handling Deletion {#deleted}
135146

147+
The `Remove` method heavily relies on `FindI`:
136148

137-
/*
138-
* First, we find an empty cell (e.g. cell is null, status empty or deleted)
139-
* - We computer a possible index:
140-
* - We first use GetHashCode() to generate a hash code,
141-
* - then shift it using collisionR.
142-
* - We check if the cell at this index is available,
143-
* - If it is not, we try with the next one,
144-
* - If all cells are occupied, we throw an error.
145-
*/
149+
```{download="./code/projects/Dictionary.zip"}
150+
!include`snippetStart="// Removing relies also on Find:", snippetEnd="// Done with Remove"` code/projects/Dictionary/Dictionary/Dictionary.cs
151+
```
152+
153+
The important aspect is to understand why we use the `Deleted` status instead of simply removing the `Cell`. There is one important reason for that.
154+
Imagine the following scenario:
155+
156+
- We want to insert two key-value pairs with keys `"Mary"` and `"Lora"`. Both `GetIndex("Mary", 0)` and `GetIndex("Lora", 0)` return 6 (with a `table` of size 13).
157+
- When we insert the value with key `"Mary"`, we use the index 6.
158+
- When we insert the value with key `"Lora"`, we use `GetIndex("Lora", 1)` to generate a new index (its value will depend on the strategy that was picked, with the `Linear` method we would get 7 **if this index is available**, otherwise we would have to compute the next available index using `GetIndex("Lora", 2)`, and so on).
159+
- We delete the value with key `"Mary"`, and then look for the value with key `"Lora"`. Our `Find` algorithm computes `GetIndex("Lora", 0)`, gets 6, and looks at `table[6]`. If our `Remove` method simply deleted the cell containing the value with key `"Mary"`, then our `Find` algorithm would conclude that no value with key `"Lora"` is in the `table`, since the index it computed is unoccupied.
160+
161+
This is the reason why we need to keep track of the deleted cells, to make sure `Find` will keep looking because it knows that possibly, when the value with key `"Lora"` was inserted, its index was already taken.
146162

147163
#### How is the size of the array decided? {#array-size}
148164

149-
<!--
150-
/*
151-
* Why prime numbers are needed is explained for example
152-
* at
153-
* https://cs.stackexchange.com/questions/11029
154-
*/
155-
156-
Also why 31
157-
-->
165+
The size of the array will in general be a prime number. This is discussed in detail [on stackexchange](https://cs.stackexchange.com/a/64191), but can be easily illustrated.
166+
Let us assume that our dictionary
158167

168+
- uses an array of size 4 (so, *not* a prime number),
169+
- uses the quadratic probe sequence strategy,
170+
- receives one value with hash 1,
171+
- then receives two values, both with hash 0.
159172

160-
<!--
173+
The first value (with hash 1) gets inserted at index 1, the second (with hash 0) gets inserted at index 0. For the third value (also with hash 0):
161174

175+
- The first index that `Add` tries is 0, but a value is already there.
176+
- The second index that `Add` tries is $0 + (1 \times 1) = 1$, but it is taken already.
177+
- The third index that `Add` tries is $0 + (2 \times 2) = 4$, which gives … 0 modulo 4,
178+
- And then the sequence goes forever: 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, …
162179

163-
We then define the `Cell` class, which we will use to store the key and value. A third attribute of the enumerated datatype `StatusType`, will be used to mark if the cell is empty, active or deleted: its purpose will become clearer later on.
180+
More generally, the quadratic probe sequence gives,
164181

165-
```{download="./code/projects/Dictionary.zip"}
166-
!include`snippetStart="// the status of a cell.", snippetEnd="// A dictionary is "` code/projects/Dictionary/Dictionary/Dictionary.cs
167-
```
182+
Table size | Probe sequence
183+
--- | ---
184+
12 | 0, 1, 4, 9, 4, 1, 0, 1, 4, 9, …
185+
13 | 0, 1, 4, 9, 3, 12, 10, 10, 12, 3, …
168186

187+
While still not ideal, we can see that using a prime number for the size helps to "break the cyclicity" every now and then and yields additional numbers in the sequence: among the first 10 probes, we go from 4 different indices to 7.
169188

189+
The default size of 31 is picked for [various reasons](https://stackoverflow.com/questions/299304/why-does-javas-hashcode-in-string-use-31-as-a-multiplier), some being historical.
170190

171-
```{download="./code/projects/Dictionary.zip"}
172-
!include code/projects/Dictionary/Dictionary/Dictionary.cs
173-
```
174191

175-
```{download="./code/projects/Dictionary.zip"}
176-
!include code/projects/Dictionary/Dictionary/PrimeHelper.cs
177-
```
192+
#### Clustering
178193

179-
```{download="./code/projects/Dictionary.zip"}
180-
!include code/projects/Dictionary/Dictionary/Program.cs
181-
```
182-
-->
194+
In general, the main goal is to avoid having parts of the array filled while other parts are left unused, a situation known as *clustering*.
195+
This situation *will* happen if too many keys are given the same hash and index, something that is hard to predict since keys will in general not be uniformly distributed and not known ahead of time.
196+
Linear probing is very bad at solving this problem, since the clusters are "spread out continuously"; quadratic probing is an improvement, but only partially solves this issue, since keys with identical hashes will still follow the same sequence.
197+
Double hashing is a bit better at solving this problem, since keys with identical hashes may drift apart significantly when the secondary hash function is applied.
183198

184-
x
199+
This general discussion relates to performance and requires measuring the dictionary's load factor, which is the number of entries occupied in the hash table divided by the table length (or number of "buckets").
200+
Of course, an open-addressed hash table cannot have a load factor greater than 1, but other techniques, such as chaining, allow for larger load factors.
201+
202+
<!--
203+
Double hashing, in which the interval between probes is computed by a secondary hash function
185204
205+
For open addressing schemes, the hash function should also avoid clustering, the mapping of two or more keys to consecutive slots. Such clustering may cause the lookup cost to skyrocket, even if the load factor is low and collisions are infrequent. The popular multiplicative hash is claimed to have particularly poor clustering behavior.[22][4]
206+
207+
https://www.javamex.com/tutorials/collections/hash_function_technical_2.shtml
208+
209+
cf. "Implementing Double Hashing" at <https://pressbooks.palni.org/anopenguidetodatastructuresandalgorithms/chapter/hashing-and-hash-tables/>
210+
-->

0 commit comments

Comments
 (0)