class MyHashSet() {
private val table = BooleanArray(1000000 + 1)
fun add(key: Int) {
table[key] = key
}
fun remove(key: Int) {
table[key] = false
}
fun contains(key: Int): Boolean {
return table[key]
}
}- We use a fixed-size array (
buckets) to store the keys. - Each bucket will store a list of keys to handle collision.
- We use a simple hash function to find the bucket.
class MyHashSet() {
private val size = 769 // A prime number reduces collisions
// We can use dynamic array or linked list to handle collision.
// Choose one of them.
private val buckets = Array(size) { mutableListOf<Int>() }
// private val buckets = Array(size) { LinkedList<Int>() }
fun add(key: Int) {
val index = hash(key)
// Remember to check if the key already exists.
if (contains(key).not()) {
buckets[index].add(key)
}
}
fun remove(key: Int) {
val index = hash(key)
buckets[index].remove(key)
}
fun contains(key: Int): Boolean {
val index = hash(key)
return key in buckets[index]
}
private fun hash(key: Int) = key % size
}- Time Complexity:
O(1)foradd(),remove(),contains()on average.O(n)for worst case. See below explanation. - Space Complexity:
O(n)wherenis the number of all possible values.
With a uniform hash distribution, each key is equally likely to go into any bucket, so:
- Total number of keys:
n. - Number of buckets:
k.
On average, each bucket will have n / k keys. If k is large enough and hash function is good, then n / k is small, so the time complexity is O(1). See more detail explanation in the topic
- We use
key % sizewheresize = 769a prime number, which helps reduce clustering in the same bucket. - Each bucket holds a small list of keys, and under good distribution (hash function is good), these lists are short (constant-sized), even if we check
contains()beforeadd(), the time complexity is stillO(1). - The worst case remains
O(n)if all keys are in the same bucket due to poor hash distribution which causes linear scan, but it's rare with a good hash function.
- 為何用質數取餘數當哈希?假設我們用
keys = [0,6,12,18,24,30]和size = 10取餘數之後都只會出現0,6,2,8,4,0,這樣會導致所有 key 都只會出現在重複的幾個 bucket 中,有些位置永遠都會用不到,這樣會導致時間複雜度變成O(n)。然而如果使用質數像是size = 7,取餘數後0,6,5,4,3,2,1,0,...,這樣會讓所有 key 都均勻分布在不同的 bucket 中,這樣可以讓時間複雜度變成O(1)。
Or equivalent with linked list implmented by ourselves
data class SetNode(
val key: Int,
var next: SetNode? = null
)
class MyHashSet() {
private val size = 769
private val multipler = 12582917L
// Initialize the bucket with sentinal node.
private val buckets = Array<SetNode>(size) { SetNode(-1) }
fun add(key: Int) {
val hash = hash(key)
val head = buckets[hash]
var previous: SetNode? = head
var current: SetNode? = head.next
// Pay attention: we don't use general way to find the last node.
// because we have to check if the key already exists, so we iterate
// every node to check if the key already exists.
//
// The following code doesn't work because we don't check if key already exists.
// var current: SetNode? = head
// while (current?.next != null) {
// current = current.next
// }
// current?.next = SetNode(key)
// We iterate every node to check if the key already exists.
while (current != null) {
if (current.key == key) return
previous = current
current = current.next
}
previous?.next = SetNode(key)
}
fun remove(key: Int) {
val hash = hash(key)
val head = buckets[hash]
var previous: SetNode? = head
var current: SetNode? = head.next
while (current != null && current.key != key) {
previous = current
current = current.next
}
previous?.next = current.next
}
fun contains(key: Int): Boolean {
val hash = hash(key)
val head = buckets[hash]
var current: SetNode? = head.next
while (current != null) {
if (current.key == key) return true
current = current.next
}
return false
}
private fun hash(key: Int): Int {
return (key.toLong() * multipler % size).toInt()
}
}- Time Complexity:
O(n / k)wherenis the number of all possible values andkis the number of buckets. - Space Complexity:
O(n + k)wherenis the number of all possible values andkis the number of buckets.
private const val EMPTY = -1
private const val REMOVED = -2
class MyHashSet() {
private val size = 10007
private val multipler = 12582917L
// We initialize the table with -1, which means empty.
private val table = IntArray(size) { EMPTY }
fun add(key: Int) {
// We check if the key exists, otherwise, add at the empty slot.
var probe = 0
var hash = hash(key, probe)
while (table[hash] != EMPTY) {
if (table[hash] == key) return
probe++
hash = hash(key, probe)
}
table[hash] = key
}
fun remove(key: Int) {
var probe = 0
var hash = hash(key, probe)
while (table[hash] != EMPTY) {
if (table[hash] == key) {
table[hash] = REMOVED
return
}
probe++
hash = hash(key, probe)
}
}
fun contains(key: Int): Boolean {
var probe = 0
var hash = hash(key, probe)
while (table[hash] != EMPTY) {
if (table[hash] == key) return true
probe++
hash = hash(key, probe)
}
return false
}
// We use linear probing to find the next empty slot.
private fun hash(key: Int, probe: Int): Int {
return (hash(key) + probe) % size
}
private fun hash(key: Int): Int {
return (key.toLong() * multipler % size).toInt()
}
}- Time Complexity:
O(1)on average if hash function is good and low load factor,O(n)wherenis the number of all possible values for worst case. - Space Complexity:
O(n)wherenis the number of all possible values.
When we remove an element, we have to mark it as deleted instead of setting it to EMPTY. This is because we have to ensure that the probing sequence is not broken. If we set the slot to EMPTY, the probing sequence will be broken and we can't find the element that is inserted after the deleted element.
For example, we have the following sequence of operations:
size = 10
hash(x) = x % 10
add(5)
add(15)
remove(5)
contains(15) // Expected: trueIf we set the slot to EMPTY when we remove the element, we see
add(5) // index 5 = 5
add(15) // index 5 is occupied -> linear probing -> index 6 = 15
remove(5) // index 5 = EMPTY
contains(15) // index 5 = EMPTY, break the while loop, return false (incorrect)If we set the slot to REMOVED when we remove the element, we see
add(5) // index 5 = 5
add(15) // index 5 is occupied -> linear probing -> index 6 = 15
remove(5) // index 5 = REMOVED
contains(15) // index 5 = REMOVED, continue probing -> index 6 = 15, return true (correct)Probing should continue until we find the key or reach an empty slot. If we use EMPTY for removed elements, the search will stop too early, failing to find valid keys that were probed forward.
But we have to modify our add() to reuse the index marked as REMOVED, when inserting a new key, we should reuse REMOVED slots if they appear before any EMPTY slot. And contains() should skip REMOVED and continue probling. This ensure that the probing sequence is not broken and we can reuse the slot that is removed before. (Not implemented in the above code)