-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSuffixArray.py
More file actions
137 lines (105 loc) · 5.11 KB
/
SuffixArray.py
File metadata and controls
137 lines (105 loc) · 5.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import pprint
from basics.data_structure.SuffixTree import SuffixTree
class Suffix:
def __init__(self, suffix: str, suffix_start_index: int):
self.suffix = suffix
self.suffix_start_index = suffix_start_index
self.longest_common_prefix_length = 0
class SuffixArray:
@staticmethod
def compute_longest_common_prefix_length(string1: str, string2: str) -> int:
index = 0
index_max = min(len(string1), len(string2)) - 1
while index <= index_max and string1[index] == string2[index]:
index += 1
return index
def __init__(self, word: str):
self.suffix_array: list[str] = []
self.suffix_start_index_array: list[int] = []
self.longest_common_prefix_length_array: list[int] = []
suffixes: list[Suffix] = [Suffix(word[left:], left) for left in range(len(word)+1)] # 包含末尾的空串后缀
suffixes.sort(key=lambda suffix_class_instance: suffix_class_instance.suffix)
for i, suffix_instance in enumerate(suffixes):
self.suffix_array.append(suffix_instance.suffix)
self.suffix_start_index_array.append(suffix_instance.suffix_start_index)
lcp_len = (0
if i == 0
else self.compute_longest_common_prefix_length(self.suffix_array[i-1], self.suffix_array[i]))
suffix_instance.longest_common_prefix_length = lcp_len
self.longest_common_prefix_length_array.append(lcp_len)
def __repr__(self):
return pprint.pformat([
self.suffix_array,
self.suffix_start_index_array,
self.longest_common_prefix_length_array
])
def contains_suffix(self, suffix: str) -> bool:
def binary_search(lo: int, hi: int) -> bool:
while True:
if lo > hi:
return False
mid = (lo + hi) // 2
if self.suffix_array[mid] < suffix:
lo = mid + 1
elif self.suffix_array[mid] > suffix:
hi = mid - 1
else:
return True
return binary_search(0, len(self.suffix_array) - 1)
def is_substring(self, substring: str) -> bool:
return len(self.substrings(substring)) > 0
def substrings(self, substring: str) -> list[int]:
# 两次二分查找, 目标区间是 [lower, upper)
# lower_boundary是第一个满足suffix[:len(substring)] >= substring的元素的索引
def binary_search_lower_boundary(lo: int, hi: int) -> int:
# 函数被调用时, 函数自己的视角: array[hi]满足
# 返回值范围[lo...hi]: [lo, hi)内所有元素都满足 -> lo, [lo, hi)内所有元素都不满足 -> hi
while True:
if lo == hi:
return lo
mid = (lo + hi) // 2
if self.suffix_array[mid][:len(substring)] < substring:
lo = mid + 1
else: # self.suffix_array[mid][:len(substring)] >= substring
hi = mid
# upper_boundary是第一个满足suffix[:len(substring)] > substring的元素的索引
def binary_search_upper_boundary(lo: int, hi: int) -> int:
# 函数被调用时, 函数自己的视角: array[hi]满足
# 返回值范围[lo...hi]: [lo, hi)内所有元素都满足 -> lo, [lo, hi)内所有元素都不满足 -> hi
while True:
if lo == hi:
return lo
mid = (lo + hi) // 2
if self.suffix_array[mid][:len(substring)] <= substring:
lo = mid + 1
else: # self.suffix_array[mid][:len(substring)] > substring
hi = mid
lower_boundary = binary_search_lower_boundary(0, len(self.suffix_array))
upper_boundary = binary_search_upper_boundary(0, len(self.suffix_array))
return sorted(self.suffix_start_index_array[lower_boundary: upper_boundary])
# https://www.youtube.com/playlist?list=PLDV1Zeh2NRsCQ_Educ7GCNs3mvzpXhHW5
# https://www.youtube.com/watch?v=IzMxbboPcqQ&list=PLM_KIlU0WoXmkV4QB1Dg8PtJaHTdWHwRS&index=72
if __name__ == '__main__':
suffix_array = SuffixArray("mississippi")
print(repr(suffix_array))
print(suffix_array.substrings("a"))
print(suffix_array.substrings("z"))
print(suffix_array.substrings("ssq"))
print(suffix_array.substrings("m"))
print(suffix_array.substrings("i"))
print(suffix_array.substrings("ssi"))
print(suffix_array.substrings("si"))
print(suffix_array.contains_suffix("sippi"))
print(suffix_array.contains_suffix("pp"))
print(suffix_array.contains_suffix(""))
suffix_tree = SuffixTree("mississippi")
print(suffix_tree.substrings("a"))
print(suffix_tree.substrings("z"))
print(suffix_tree.substrings("ssq"))
print(suffix_tree.substrings("m"))
print(suffix_tree.substrings("i"))
print(suffix_tree.substrings("ssi"))
print(suffix_tree.substrings("si"))
print(suffix_tree.contains_suffix("sippi"))
print(suffix_tree.contains_suffix("pp"))
print(suffix_tree.contains_suffix(""))