Added details on dictionary.

Clément · Clément · commit 7d3053776e92 · 2025-10-20T20:31:11.000-04:00
diff --git a/source/code/projects/Dictionary/Dictionary/Dictionary.cs b/source/code/projects/Dictionary/Dictionary/Dictionary.cs
@@ -99,18 +99,18 @@ public void Clear()
     // how many collisions we met so far.
     public int GetIndex(TKey keyP, int countP)
     {
-        // countP captures the number of times we had to solve
-        // a collision.
+        // countP captures the number of times we had 
+        // to solve a collision.
         return (
             Math.Abs(keyP.GetHashCode())
             + CollisionResolution(keyP, countP)
           ) % table.Length;
     }
 
-    // This is the how collision are handled.
-    // It depends on the strategy picked,
-    // the key, and the number of time we had
-    // to handle a collision.
+    // This is how collisions are handled.
+    // It depends on the strategy picked (Strategy),
+    // the key (keyP), and the number of times we had
+    // to handle a collision (countP).
     private int CollisionResolution(TKey keyP, int countP)
     {
         if (countP == 0)
@@ -153,12 +153,14 @@ public void Add(TKey keyP, TValue valueP)
         )
         {
             count++;
-            if (count == table.Length) // If table is full, throw an exception.
+            if (count == table.Length)
             {
+                // If table is full, throw an exception.
                 throw new ApplicationException("Table is full.");
             }
-            else // there is still room, generate the next index.
+            else
             {
+                // There is still room, generate the next index.
                 index = GetIndex(keyP, count);
             }
         }
@@ -178,6 +180,7 @@ public void Add(TKey keyP, TValue valueP)
     }
     // Done with adding an element
 
+        // We use a bool Find sub-routine
     public bool Find(TKey keyP)
     {
         bool found = FindI(keyP) != -1;
@@ -207,7 +210,9 @@ public int FindI(TKey keyP)
         if (!found) { index = -1; }
         return index;
     }
+    // Done with found.
 
+        // Removing relies also on Find:
     public void Remove(TKey keyP)
     {
         int index = FindI(keyP);
@@ -220,4 +225,5 @@ public void Remove(TKey keyP)
             table[index].Status = StatusType.Deleted;
         }
     }
+    // Done with Remove
 }
diff --git a/source/code/projects/Dictionary/Dictionary/Program.cs b/source/code/projects/Dictionary/Dictionary/Program.cs
@@ -1,7 +1,9 @@
 ﻿using System;
 using System.Collections.Generic;
 
-
+// A simple, dummy class, to illustrate that 
+// dictionaries can contain any type of 
+// values, even complex ones.
 public class Address
 {
     public string Street { get; set; }
@@ -24,13 +26,18 @@ static void Main(string[] args)
           string,
           Address
         >(13, CDictionary<string, Address>.PSSType.Linear);
-        // Key of type string, value of type int.
+        // Key of type string, value of type Address, Linear probe sequence strategy.
+        // Try with
+        // CDictionary<string, Address>.PSSType.Linear
+        // CDictionary<string, Address>.PSSType.Quad 
+        // CDictionary<string, Address>.PSSType.Double
+
+        string[] friends = { "Bob", "Sarah", "Sam", "Justice", "Claire", "Pierre", "Mary", "Lora" };
 
-        string[] friends = { "Bob", "Sarah", "Justice", "Claire", "Pierre", "Mary", "Lora" };
         foreach (string friend in friends)
         {
             Console.WriteLine("Inserting " + friend + " (Original index: " + notebook.GetIndex(friend, 0) + ")");
-            notebook.Add(friend, new Address("Main St."));
+            notebook.Add(friend, new Address(notebook.GetIndex(friend, 0) + " Main St."));
         }
 
         Console.WriteLine(notebook);
@@ -59,54 +66,29 @@ static void Main(string[] args)
             Console.WriteLine(ex.Message);
         }
 
-        /*
+        Console.WriteLine("Clearing the notebook.");
+        notebook.Clear();
+        Console.WriteLine(notebook);
 
-        notebook.Add("twenty", 20);
-    notebook.Add("fourteen", 14);
-    notebook.Add("two", 2);
-    notebook.Add("seventeen", 17);
-    notebook["fifteen"] = 15;
-    Console.Write(notebook);
-    Console.WriteLine(notebook["two"]);
-    notebook["two"] = 10;
-    Console.WriteLine(notebook["two"]);
+        string[] friends2 = {"Pierre", "Sandra", "Joy", "Nicole", "Sam", "Fritz"}; 
 
-    int x = notebook.Find("two");
-    Console.WriteLine($"Found x = {x}");
-    try
-    {
-      int y = notebook.Find("zzz");
-      Console.WriteLine($"Found x = {y}");
-    }
-    catch (Exception)
-    {
-      Console.WriteLine($"Didn't find zzz");
-    }
+    foreach (string friend in friends2)
+        {
+            Console.WriteLine("Inserting " + friend + " (Original index: " + notebook.GetIndex(friend, 0) + ")");
+            notebook.Add(friend, new Address(notebook.GetIndex(friend, 0) + " Main St."));
+        }
+        Console.WriteLine(notebook);
 
-    notebook.Remove("two");
-    try
-    {
-      int y = notebook.Find("two");
-      Console.WriteLine($"Should not find two = {y}");
-    }
-    catch (Exception)
-    {
-      Console.WriteLine(
-        $"Didn't find two since it was removed"
-      );
-    }
-    try
-    {
-      notebook.Remove("two");
-      int y = notebook.Find("two");
-      Console.WriteLine($"Should not find two = {y}");
-    }
-    catch (Exception)
-    {
-      Console.WriteLine(
-        $"Shoud throw when trying to remove two since it was removed"
-      );
-    }
-    */
+        Console.WriteLine("If the table size is not prime, we obtain:");
+        for (int i = 0; i < 10; i++)
+        {
+            Console.Write((i * i) % 12 + ", ");
+        }
+
+        Console.WriteLine("…\nIf the table size is prime, we obtain:");
+        for (int i = 0; i < 10; i++)
+        {
+            Console.Write((i * i) % 13 + ", ");
+        }
     }
 }
diff --git a/source/lectures/data/dictionary.md b/source/lectures/data/dictionary.md
@@ -10,7 +10,7 @@ tags:
 
 ### Abstract Data Type
 
-A *dictionary*, also called a *hash*, an *associative array*, a *map*, or a *hashmap*, is a key-value store: it stores values (that can be of any type) and indexes them using a key (which is in general of a simple type, such as `int`).
+A *dictionary*, also called a *hash*, an *associative array*, a *map*, or a *hashmap*, is a key-value store: it stores values (that can be of any type) and indexes them using a key (which is in general of a simple type, such as `int` or `string`).
 
 Described [abstractly](./lectures/data/intro#abstract-data-types), [a dictionary](https://en.wikipedia.org/wiki/Hash_table) is 
 
@@ -22,18 +22,20 @@ Generally, it has operations to…
 
 - … create an empty dictionary,
 - … test for emptiness, 
-- … insert or update a value,
+- … insert or update a key-value pair,
 - … remove a key-value pair,
-- … test for existence of a key.
+- … test for the existence of a key.
 
 And, very importantly, it uses
 
-- *a hash function*, which transforms the key into an `int` (its *hash*), used as to produce an array index, 
-- *a collision resolution strategy*, which handles when two *different* keys have been assigned the same index.
+- *a hash function*, which transforms a key into an `int` (its *hash*), used to produce an array index, 
+- *a collision resolution strategy*, which handles when two *different* keys have the same hash.
+
+Note that the collision resolution strategy is useful only when two *different* keys have the same hash: a key should always get assigned the same hash, and since a key cannot be part of two different key-value pairs, we should not try to resolve this conflict, but instead throw an exception.
 
 ### Overview
 
-A dictionary organises the key-value pairs into an array by storing it in its corresponding index, computed using the hash of the key.
+A dictionary organizes the key-value pairs into an array by storing each pair at its corresponding index, computed using the hash of the key.
 The main benefit of this approach is that looking if a key-value pair is already in the dictionary is immediate: it suffices to hash the key, and to look at the index obtained if the same key is already stored.
 The main downside is that *multiple (different) keys can be assigned the same hash, and hence the same index*: indeed, since the keys that will be used is not known ahead of time, it is possible that different keys are assigned the same index. This is a *collision*, and there are two main ways of resolving it:
 
@@ -108,78 +110,101 @@ We obtain the following, where the details of `CollisionResolution` are not impo
 
 #### Adding an element
 
-Adding an element is a lengthy process.
-We simply require a key and a value, compute an index, store it into a variable `index`, and proceed as follows:
-
-As long as the table contains a `Cell` at `index` whose status is not `Deleted` nor `Empty`,
+Adding an element is a delicate process.
+We only need a key and a value, and then we 
 
-- Increment the counter `count` that count the number of times we saw a conflict,
-- Check if `count` reached the size of the array:
-    - if that is the case, throw an exception: we saw as many collisions as there are slots in the array, meaning that the array is full,
-    - otherwise, generate another `index`, knowing that we met `count` conflicts already.
+- make sure the dictionary does not already contain a key-value pair with the same key ([detailed below](#find)),
+- compute an index, store it into a variable `index`, and proceed as follows:
 
-Once we have reached this point (i.e., once we exit the `while` loop), we know that `index` refers to a place in the array that is either empty, 
-        
+    As long as the table contains a `Cell` at `index` whose status is not `Deleted` nor `Empty`, we
 
+    - Increment the counter `count` that counts the number of times we had to "look for a new index" (i.e., solve a conflict),
+    - Check if `count` reached the size of the array:
+        - if that is the case, throw an exception: we saw as many collisions as there are slots in the array, meaning that the array is full,
+        - otherwise, generate another `index`, knowing that we met `count` conflicts already.
 
+- once we have reached this point (i.e., we found a suitable `index`), we know that `index` refers to a place in the array that is either 
+    
+    - empty, in which case we can create a `Cell` object using the parameters,
+    - with a status set to `deleted` or `empty`, and we can re-use it.
 
 ```{download="./code/projects/Dictionary.zip"}
 !include`snippetStart="// Adding an element", snippetEnd="// Done with adding an element"` code/projects/Dictionary/Dictionary/Dictionary.cs
 ```
 
+#### Finding a key {#find}
 
-<!--
-https://en.wikibooks.org/wiki/Data_Structures/Hash_Tables#Open_addressing
+For `find`, we use a subroutine `FindI` that computes the index of a key if it exists, and returns -1 otherwise.
+The critical point is to understand that we *need to keep looking even if the cell is marked as `deleted`*.
+We illustrate this [point below](#deleted).
+
+```{download="./code/projects/Dictionary.zip"}
+!include`snippetStart="// We use a bool Find sub-routine", snippetEnd="// Done with found."` code/projects/Dictionary/Dictionary/Dictionary.cs
+```
 
-https://github.com/dotnet/runtime/issues/38340
+#### Handling Deleting {#deleted}
 
+The `Remove` method heavily relies on `FindI`:
 
-    /*
-     * First, we find an empty cell (e.g. cell is null, status empty or deleted)
-     * - We computer a possible index:
-     *      - We first use GetHashCode() to generate a hash code,
-     *      - then shift it using collisionR.
-     * - We check if the cell at this index is available,
-     * - If it is not, we try with the next one,
-     * - If all cells are occupied, we throw an error.
-     */
+```{download="./code/projects/Dictionary.zip"}
+!include`snippetStart="// Removing relies also on Find:", snippetEnd="// Done with Remove"` code/projects/Dictionary/Dictionary/Dictionary.cs
+```
+
+The important aspect is to understand why we use the `Deleted` status instead of simply removing the `Cell`. There is one important reason for that.
+Imagine the following scenario:
+
+- We want to insert two key-value pairs with keys `"Mary"` and `"Lora"`. Both `GetIndex("Mary", 0)` and `GetIndex("Lora", 0)` return 6 (with a `table` of size 13).
+- When we insert the value with key `"Mary"`, we use the index 6.
+- When we insert the value with key `"Lora"`, we use `GetIndex("Lora", 1)` to generate a new index (its value will depend on the strategy that was picked, with the `Linear` method we would get 7 **if this index is available**, otherwise we would have to compute the next available index using `GetIndex("Lora", 2)`, and so on).
+- We delete the value with key `"Mary"`, and then look for the value with key `"Lora"`. Our `Find` algorithm computes `GetIndex("Lora", 0)`, gets 6, and looks at `table[6]`. If our `Remove` method simply deleted the cell containing the key with value `"Mary"`, then our `Find` algorithm would conclude that no value with key `"Lora"` is in the `table`, since the index it computed is unoccupied.
+
+This is the reason why we need to keep track of the deleted cells, to make sure `Find` will keep looking because it knows that possibly, when the value with key `"Lora"` was inserted, its index was already taken.
 
 #### How is the size of the array decided? {#array-size}
 
-<!--
-/*
- * Why prime numbers are needed is explained for example
- * at
- * https://cs.stackexchange.com/questions/11029
- */
- 
- Also why 31
--->
+The size of the array will in general be a prime number. This is discussed in detail [on stackexchange](https://cs.stackexchange.com/a/64191), but can be easily illustrated.
+Let us assume that our dictionary
 
+- uses an array of size 4 (so, *not* a prime number),
+- uses the quadratic probe sequence strategy,
+- receives one value with hash 1,
+- then receives two values, both with hash 0.
 
-<!--
+The first value (with hash 1) gets inserted at index 1, the second (with hash 0) gets inserted at index 0. For the third value:
 
+- The first index that `Add` tries is 0, but a value is already there.
+- The second index that `Add` tries is $0 + (1 \times 1) = 1$, but it is taken already.
+- The third index that `Add` tries is $0 + (2 \times 2) = 4$, which gives … 0 modulo 4, 
+- And then the sequence goes forever: 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, …
 
-We then define the `Cell` class, which we will use to store the key and value. A third attribute of the enumerated datatype `StatusType`, will be used to mark if the cell is empty, active or deleted: its purpose will become clearer later on.
+More generally, the quadratic probe sequence gives,
 
-```{download="./code/projects/Dictionary.zip"}
-!include`snippetStart="// the status of a cell.", snippetEnd="// A dictionary is "` code/projects/Dictionary/Dictionary/Dictionary.cs
-```
+Table size | Probe sequence
+--- | --- 
+12 | 0, 1, 4, 9, 4, 1, 0, 1, 4, 9, …
+13 | 0, 1, 4, 9, 3, 12, 10, 10, 12, 3, …
 
+While still not ideal, we can see that using a prime number for the size allows us to "break the cyclicity" every now and then and to obtain additional numbers in the sequence: we go from 4 different indices to 7 per 10 indices.
 
+The default size of 31 is picked for [various reasons](https://stackoverflow.com/questions/299304/why-does-javas-hashcode-in-string-use-31-as-a-multiplier), some being historical.
 
-```{download="./code/projects/Dictionary.zip"}
-!include code/projects/Dictionary/Dictionary/Dictionary.cs
-```
 
-```{download="./code/projects/Dictionary.zip"}
-!include code/projects/Dictionary/Dictionary/PrimeHelper.cs
-```
+#### Clustering
 
-```{download="./code/projects/Dictionary.zip"}
-!include code/projects/Dictionary/Dictionary/Program.cs
-```
--->
+In general, the main goal is to avoid having parts of the array filled while other parts are left unused, a situation known as *clustering*.
+This situation *will* happen if too many keys are given the same hash and index, something that is hard to predict since keys will in general not be uniformly distributed and not known ahead of time.
+Linear probing is very bad at solving this problem, since the clusters are "spread out continuously". Quadratic probing is an improvement, but only partially solves this issue, since keys with identical hashes will still follow the same sequence. 
+Double hashing is a bit better at solving this problem, since keys with identical hashes may drift apart significantly when the secondary hash function is applied.
 
-x
+This general discussion relates to performance and requires measuring the dictionary's load factor, which is the number of entries occupied in the hash table divided by the table length (or number of "buckets").
+Of course, an open-addressed hash table cannot have a load factor greater than 1, but other techniques, such as chaining, allow for larger load factors.
+
+<!--
+Double hashing, in which the interval between probes is computed by a secondary hash function
 
+For open addressing schemes, the hash function should also avoid clustering, the mapping of two or more keys to consecutive slots. Such clustering may cause the lookup cost to skyrocket, even if the load factor is low and collisions are infrequent. The popular multiplicative hash is claimed to have particularly poor clustering behavior.[22][4]
+
+https://www.javamex.com/tutorials/collections/hash_function_technical_2.shtml
+
+cf. "Implementing Double Hashing" at <https://pressbooks.palni.org/anopenguidetodatastructuresandalgorithms/chapter/hashing-and-hash-tables/>
+-->

Original file line number	Diff line number	Diff line change
`@@ -99,18 +99,18 @@ public void Clear()`
`99`	`99`	`// how many collisions we met so far.`
`100`	`100`	`public int GetIndex(TKey keyP, int countP)`
`101`	`101`	`{`
`102`		`- // countP captures the number of times we had to solve`
`103`		`- // a collision.`
	`102`	`+ // countP captures the number of times we had`
	`103`	`+ // to solve a collision.`
`104`	`104`	`return (`
`105`	`105`	`Math.Abs(keyP.GetHashCode())`
`106`	`106`	`+ CollisionResolution(keyP, countP)`
`107`	`107`	`) % table.Length;`
`108`	`108`	`}`
`109`	`109`
`110`		`- // This is the how collision are handled.`
`111`		`- // It depends on the strategy picked,`
`112`		`- // the key, and the number of time we had`
`113`		`- // to handle a collision.`
	`110`	`+ // This is how collisions are handled.`
	`111`	`+ // It depends on the strategy picked (Strategy),`
	`112`	`+ // the key (keyP), and the number of time we had`
	`113`	`+ // to handle a collision (countP).`
`114`	`114`	`private int CollisionResolution(TKey keyP, int countP)`
`115`	`115`	`{`
`116`	`116`	`if (countP == 0)`
`@@ -153,12 +153,14 @@ public void Add(TKey keyP, TValue valueP)`
`153`	`153`	`)`
`154`	`154`	`{`
`155`	`155`	`count++;`
`156`		`- if (count == table.Length) // If table is full, throw an exception.`
	`156`	`+ if (count == table.Length)`
`157`	`157`	`{`
	`158`	`+ // If table is full, throw an exception.`
`158`	`159`	`throw new ApplicationException("Table is full.");`
`159`	`160`	`}`
`160`		`- else // there is still room, generate the next index.`
	`161`	`+ else`
`161`	`162`	`{`
	`163`	`+ // There is still room, generate the next index.`
`162`	`164`	`index = GetIndex(keyP, count);`
`163`	`165`	`}`
`164`	`166`	`}`
`@@ -178,6 +180,7 @@ public void Add(TKey keyP, TValue valueP)`
`178`	`180`	`}`
`179`	`181`	`// Done with adding an element`
`180`	`182`
	`183`	`+ // We use a bool Find sub-routine`
`181`	`184`	`public bool Find(TKey keyP)`
`182`	`185`	`{`
`183`	`186`	`bool found = FindI(keyP) != -1;`
`@@ -207,7 +210,9 @@ public int FindI(TKey keyP)`
`207`	`210`	`if (!found) { index = -1; }`
`208`	`211`	`return index;`
`209`	`212`	`}`
	`213`	`+ // Done with found.`
`210`	`214`
	`215`	`+ // Removing relies also on Find:`
`211`	`216`	`public void Remove(TKey keyP)`
`212`	`217`	`{`
`213`	`218`	`int index = FindI(keyP);`
`@@ -220,4 +225,5 @@ public void Remove(TKey keyP)`
`220`	`225`	`table[index].Status = StatusType.Deleted;`
`221`	`226`	`}`
`222`	`227`	`}`
	`228`	`+ // Done with Remove`
`223`	`229`	`}`