Skip to content

Commit f3b5e22

Browse files
authored
improve typing (#141)
1 parent d5c128f commit f3b5e22

15 files changed

Lines changed: 107 additions & 89 deletions

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
- name: Install dependencies
2323
run: |
2424
python -m pip install --upgrade pip
25-
python -m pip install flake8 pytest pytest-cov
25+
python -m pip install flake8 pytest pytest-cov flake8-pyproject
2626
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
2727
- name: Lint with flake8
2828
run: |

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# PyProbables Changelog
22

3+
### Version 0.7.0
4+
5+
***Breaking Changes***
6+
Minor breaking changes; mismatched Bloom filters raise a `SimilarityError` instead of returning `None`
7+
8+
* `BitArray`
9+
* Add ability to read and write as bytes
10+
* Add ability to export
11+
* Updated typing to be more consistent and correct
12+
13+
314
### Version 0.6.2
415

516
* `BloomFilterOnDisk`

probables/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
NotSupportedError,
1616
ProbablesBaseException,
1717
RotatingBloomFilterError,
18+
SimilarityError,
1819
)
1920
from probables.quotientfilter import QuotientFilter
2021
from probables.utilities import Bitarray
@@ -48,4 +49,5 @@
4849
"RotatingBloomFilterError",
4950
"QuotientFilter",
5051
"Bitarray",
52+
"SimilarityError",
5153
]

probables/blooms/bloom.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from textwrap import wrap
1919
from typing import Union
2020

21-
from probables.exceptions import InitializationError, NotSupportedError
21+
from probables.exceptions import InitializationError, NotSupportedError, SimilarityError
2222
from probables.hashes import HashFuncT, HashResultsT, KeyT, default_fnv_1a
2323
from probables.utilities import MMap, is_hex_string, is_valid_file, resolve_path
2424

@@ -368,7 +368,7 @@ def current_false_positive_rate(self) -> float:
368368
exp = math.exp(dbl)
369369
return math.pow((1 - exp), self.number_hashes)
370370

371-
def intersection(self, second: SimpleBloomT) -> Union[SimpleBloomT, None]:
371+
def intersection(self, second: SimpleBloomT) -> SimpleBloomT:
372372
"""Return a new Bloom Filter that contains the intersection of the
373373
two
374374
@@ -378,15 +378,14 @@ def intersection(self, second: SimpleBloomT) -> Union[SimpleBloomT, None]:
378378
BloomFilter: The new Bloom Filter containing the intersection
379379
Raises:
380380
TypeError: When second is not either a :class:`BloomFilter` or :class:`BloomFilterOnDisk`
381+
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
381382
Note:
382-
`second` may be a BloomFilterOnDisk object
383-
Note:
384-
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
383+
`second` may be a BloomFilterOnDisk object"""
385384
if not _verify_not_type_mismatch(second):
386385
raise TypeError(MISMATCH_MSG)
387386

388387
if self._verify_bloom_similarity(second) is False:
389-
return None
388+
raise SimilarityError("Bloom Filters are not similar")
390389

391390
res = BloomFilter(
392391
self.estimated_elements,
@@ -399,7 +398,7 @@ def intersection(self, second: SimpleBloomT) -> Union[SimpleBloomT, None]:
399398
res.elements_added = res.estimate_elements()
400399
return res
401400

402-
def union(self, second: SimpleBloomT) -> Union["BloomFilter", None]:
401+
def union(self, second: SimpleBloomT) -> "BloomFilter":
403402
"""Return a new Bloom Filter that contains the union of the two
404403
405404
Args:
@@ -408,15 +407,14 @@ def union(self, second: SimpleBloomT) -> Union["BloomFilter", None]:
408407
BloomFilter: The new Bloom Filter containing the union
409408
Raises:
410409
TypeError: When second is not either a :class:`BloomFilter` or :class:`BloomFilterOnDisk`
410+
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
411411
Note:
412-
`second` may be a BloomFilterOnDisk object
413-
Note:
414-
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
412+
`second` may be a BloomFilterOnDisk object"""
415413
if not _verify_not_type_mismatch(second):
416414
raise TypeError(MISMATCH_MSG)
417415

418416
if self._verify_bloom_similarity(second) is False:
419-
return None
417+
raise SimilarityError("Bloom Filters are not similar")
420418

421419
res = BloomFilter(
422420
self.estimated_elements,
@@ -429,7 +427,7 @@ def union(self, second: SimpleBloomT) -> Union["BloomFilter", None]:
429427
res.elements_added = res.estimate_elements()
430428
return res
431429

432-
def jaccard_index(self, second: SimpleBloomT) -> Union[float, None]:
430+
def jaccard_index(self, second: SimpleBloomT) -> float:
433431
"""Calculate the jaccard similarity score between two Bloom Filters
434432
435433
Args:
@@ -438,15 +436,14 @@ def jaccard_index(self, second: SimpleBloomT) -> Union[float, None]:
438436
float: A numeric value between 0 and 1 where 1 is identical and 0 means completely different
439437
Raises:
440438
TypeError: When second is not either a :class:`BloomFilter` or :class:`BloomFilterOnDisk`
439+
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
441440
Note:
442-
`second` may be a BloomFilterOnDisk object
443-
Note:
444-
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
441+
`second` may be a BloomFilterOnDisk object"""
445442
if not _verify_not_type_mismatch(second):
446443
raise TypeError(MISMATCH_MSG)
447444

448445
if self._verify_bloom_similarity(second) is False:
449-
return None
446+
raise SimilarityError("Bloom Filters are not similar")
450447

451448
count_union = 0
452449

@@ -694,7 +691,7 @@ def _get_element(self, idx: int) -> int:
694691

695692
def __update(self):
696693
"""update the on disk Bloom Filter and ensure everything is out to disk"""
697-
self._bloom.flush()
698-
self.__file_pointer.seek(-1 * self._UPDATE_OFFSET.size, os.SEEK_END)
699-
self.__file_pointer.write(self._EXPECTED_ELM_STRUCT.pack(self.elements_added))
700-
self.__file_pointer.flush()
694+
self._bloom.flush() # type: ignore
695+
self.__file_pointer.seek(-1 * self._UPDATE_OFFSET.size, os.SEEK_END) # type: ignore
696+
self.__file_pointer.write(self._EXPECTED_ELM_STRUCT.pack(self.elements_added)) # type: ignore
697+
self.__file_pointer.flush() # type: ignore

probables/blooms/countingbloom.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from probables.blooms.bloom import BloomFilter
1414
from probables.constants import UINT32_T_MAX, UINT64_T_MAX
15-
from probables.exceptions import InitializationError
15+
from probables.exceptions import InitializationError, SimilarityError
1616
from probables.hashes import HashFuncT, HashResultsT, KeyT
1717
from probables.utilities import is_hex_string, is_valid_file, resolve_path
1818

@@ -208,7 +208,7 @@ def remove_alt(self, hashes: HashResultsT, num_els: int = 1) -> int:
208208
self.elements_added -= to_remove
209209
return min_val - to_remove
210210

211-
def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]: # type: ignore
211+
def intersection(self, second: "CountingBloomFilter") -> "CountingBloomFilter": # type: ignore
212212
"""Take the intersection of two Counting Bloom Filters
213213
214214
Args:
@@ -217,17 +217,16 @@ def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFil
217217
CountingBloomFilter: The new Counting Bloom Filter containing the intersection
218218
Raises:
219219
TypeError: When second is not a :class:`CountingBloomFilter`
220+
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
220221
Note:
221222
The elements_added property will be set to the estimated number of unique elements \
222-
added as found in estimate_elements()
223-
Note:
224-
If `second` is not of the same size (false_positive_rate and est_elements) then \
225-
this will return `None`"""
223+
added as found in estimate_elements()"""
226224
if not _verify_not_type_mismatch(second):
227225
raise TypeError(MISMATCH_MSG)
228226

229227
if self._verify_bloom_similarity(second) is False:
230-
return None
228+
raise SimilarityError("Counting Bloom Filters are not similar enough to calculate similarity")
229+
231230
res = CountingBloomFilter(
232231
est_elements=self.estimated_elements,
233232
false_positive_rate=self.false_positive_rate,
@@ -241,7 +240,7 @@ def intersection(self, second: "CountingBloomFilter") -> Union["CountingBloomFil
241240
res.elements_added = res.estimate_elements()
242241
return res
243242

244-
def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]: # type:ignore
243+
def jaccard_index(self, second: "CountingBloomFilter") -> float: # type: ignore
245244
"""Take the Jaccard Index of two Counting Bloom Filters
246245
247246
Args:
@@ -250,15 +249,14 @@ def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]: #
250249
float: A numeric value between 0 and 1 where 1 is identical and 0 means completely different
251250
Raises:
252251
TypeError: When second is not a :class:`CountingBloomFilter`
252+
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
253253
Note:
254-
The Jaccard Index is based on the unique set of elements added and not the number of each element added
255-
Note:
256-
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
254+
The Jaccard Index is based on the unique set of elements added and not the number of each element added"""
257255
if not _verify_not_type_mismatch(second):
258256
raise TypeError(MISMATCH_MSG)
259257

260258
if self._verify_bloom_similarity(second) is False:
261-
return None
259+
raise SimilarityError("Counting Bloom Filters are not similar enough to calculate similarity")
262260

263261
count_union = 0
264262
count_inter = 0
@@ -271,7 +269,7 @@ def jaccard_index(self, second: "CountingBloomFilter") -> Union[float, None]: #
271269
return 1.0
272270
return count_inter / count_union
273271

274-
def union(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", None]: # type:ignore
272+
def union(self, second: "CountingBloomFilter") -> "CountingBloomFilter": # type:ignore
275273
"""Return a new Counting Bloom Filter that contains the union of
276274
the two
277275
@@ -281,16 +279,16 @@ def union(self, second: "CountingBloomFilter") -> Union["CountingBloomFilter", N
281279
CountingBloomFilter: The new Counting Bloom Filter containing the union
282280
Raises:
283281
TypeError: When second is not a :class:`CountingBloomFilter`
282+
SimilarityError: When second is not of the same size (false_positive_rate and est_elements)
284283
Note:
285284
The elements_added property will be set to the estimated number of unique elements added as \
286-
found in estimate_elements()
287-
Note:
288-
If `second` is not of the same size (false_positive_rate and est_elements) then this will return `None`"""
285+
found in estimate_elements()"""
289286
if not _verify_not_type_mismatch(second):
290287
raise TypeError(MISMATCH_MSG)
291288

292289
if self._verify_bloom_similarity(second) is False:
293-
return None
290+
raise SimilarityError("Counting Bloom Filters are not similar enough to calculate similarity")
291+
294292
res = CountingBloomFilter(
295293
est_elements=self.estimated_elements,
296294
false_positive_rate=self.false_positive_rate,

probables/cuckoo/countingcuckoo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ def _parse_buckets(self, d: ByteString) -> None:
304304
start = end
305305
end += bin_size
306306

307-
def _expand_logic(self, extra_fingerprint: "CountingCuckooBin") -> None:
307+
def _expand_logic(self, extra_fingerprint: Union["CountingCuckooBin", None]) -> None:
308308
"""the logic to actually expand the cuckoo filter"""
309309
# get all the fingerprints
310310
fingerprints = self._setup_expand(extra_fingerprint)

probables/cuckoo/cuckoo.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ def _indicies_from_fingerprint(self, fingerprint):
487487
Args:
488488
fingerprint (int): The fingerprint to use for generating indices"""
489489
idx_1 = fingerprint % self.capacity
490-
idx_2 = self.__hash_func(str(fingerprint)) % self.capacity
490+
idx_2 = self.__hash_func(str(fingerprint)) % self.capacity # type: ignore
491491
return idx_1, idx_2
492492

493493
def _generate_fingerprint_info(self, key: KeyT) -> tuple[int, int, int]:
@@ -497,7 +497,7 @@ def _generate_fingerprint_info(self, key: KeyT) -> tuple[int, int, int]:
497497
key (str): The element for which information is to be generated
498498
"""
499499
# generate the fingerprint along with the two possible indices
500-
hash_val = self.__hash_func(key)
500+
hash_val = self.__hash_func(key) # type: ignore
501501
fingerprint = get_x_bits(hash_val, 64, self.fingerprint_size_bits, True)
502502
idx_1, idx_2 = self._indicies_from_fingerprint(fingerprint)
503503

probables/exceptions.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,17 @@ def __init__(self, message: str) -> None:
3737
super().__init__(self.message)
3838

3939

40+
class SimilarityError(ProbablesBaseException):
41+
"""Similarity Exception
42+
43+
Args:
44+
message (str): The error message to be reported"""
45+
46+
def __init__(self, message: str) -> None:
47+
self.message = message
48+
super().__init__(self.message)
49+
50+
4051
class CuckooFilterFullError(ProbablesBaseException):
4152
"""Cuckoo Filter Full Exception
4253

probables/hashes.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@
99

1010
KeyT = Union[str, bytes]
1111
SimpleHashT = Callable[[KeyT, int], int]
12+
SimpleHashBytesT = Callable[[KeyT, int], bytes]
1213
HashResultsT = list[int]
1314
HashFuncT = Callable[[KeyT, int], HashResultsT]
1415
HashFuncBytesT = Callable[[KeyT, int], bytes]
1516

1617

17-
def hash_with_depth_bytes(func: HashFuncBytesT) -> HashFuncT:
18+
def hash_with_depth_bytes(func: Union[HashFuncBytesT, SimpleHashBytesT]) -> HashFuncT:
1819
"""Decorator that turns a function taking a single key and hashing it to
1920
bytes. Wraps functions to be used in Bloom filters and Count-Min sketch
2021
data structures.
@@ -40,7 +41,7 @@ def hashing_func(key, depth=1):
4041
return hashing_func
4142

4243

43-
def hash_with_depth_int(func: HashFuncT) -> HashFuncT:
44+
def hash_with_depth_int(func: Union[HashFuncT, SimpleHashT]) -> HashFuncT:
4445
"""Decorator to turn a function that takes a single key and hashes it to
4546
an int. Wraps functions to be used in Bloom filters and Count-Min
4647
sketch data structures.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ max-line-length = 120
5757
max-line-length = 120
5858

5959
[tool.flake8]
60+
extend-ignore = ["E203"]
6061
max-line-length = 120
6162

6263
[tool.isort]

0 commit comments

Comments
 (0)