Skip to content

Commit ba7dbd5

Browse files
Merge pull request #8924 from ThomasWaldmann/bh64-balanced-bit-distribution
buzhash64: deterministically create a balanced bh table
2 parents ffe55d2 + 3617b63 commit ba7dbd5

8 files changed

Lines changed: 469 additions & 45 deletions

File tree

src/borg/archiver/benchmark_cmd.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,23 +134,33 @@ def do_benchmark_cpu(self, args):
134134
key_96 = os.urandom(12)
135135

136136
import io
137-
from ..chunkers import get_chunker
137+
from ..chunkers import get_chunker # noqa
138138

139139
print("Chunkers =======================================================")
140140
size = "1GB"
141141

142-
def chunkit(chunker_name, *args, **kwargs):
142+
def chunkit(ch):
143143
with io.BytesIO(random_10M) as data_file:
144-
ch = get_chunker(chunker_name, *args, **kwargs)
145144
for _ in ch.chunkify(fd=data_file):
146145
pass
147146

148-
for spec, func in [
149-
("buzhash,19,23,21,4095", lambda: chunkit("buzhash", 19, 23, 21, 4095, sparse=False)),
150-
("buzhash64,19,23,21,4095", lambda: chunkit("buzhash64", 19, 23, 21, 4095, sparse=False)),
151-
("fixed,1048576", lambda: chunkit("fixed", 1048576, sparse=False)),
147+
for spec, setup, func, vars in [
148+
(
149+
"buzhash,19,23,21,4095",
150+
"ch = get_chunker('buzhash', 19, 23, 21, 4095, sparse=False)",
151+
"chunkit(ch)",
152+
locals(),
153+
),
154+
# note: the buzhash64 chunker creation is rather slow, so we must keep it in setup
155+
(
156+
"buzhash64,19,23,21,4095",
157+
"ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False)",
158+
"chunkit(ch)",
159+
locals(),
160+
),
161+
("fixed,1048576", "ch = get_chunker('fixed', 1048576, sparse=False)", "chunkit(ch)", locals()),
152162
]:
153-
print(f"{spec:<24} {size:<10} {timeit(func, number=100):.3f}s")
163+
print(f"{spec:<24} {size:<10} {timeit(func, setup, number=100, globals=vars):.3f}s")
154164

155165
from ..checksums import crc32, xxh64
156166

src/borg/chunkers/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ def get_chunker(algo, *params, **kw):
1313
# key.chunk_seed only has 32 bits
1414
seed = key.chunk_seed if key is not None else 0
1515
# for buzhash64, we want a much longer key, so we derive it from the id key
16-
bh64_key = key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b""
16+
bh64_key = (
17+
key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32
18+
)
1719
if algo == "buzhash":
1820
return Chunker(seed, *params, sparse=sparse)
1921
if algo == "buzhash64":

src/borg/chunkers/buzhash64.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ API_VERSION: str
66

77
def buzhash64(data: bytes, key: bytes) -> int: ...
88
def buzhash64_update(sum: int, remove: int, add: int, len: int, key: bytes) -> int: ...
9+
def buzhash64_get_table(key: bytes) -> List[int]: ...
910

1011
class ChunkerBuzHash64:
1112
def __init__(

src/borg/chunkers/buzhash64.pyx

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@ API_VERSION = '1.2_01'
44

55
import cython
66
import time
7-
from hashlib import sha256
87

98
from cpython.bytes cimport PyBytes_AsString
109
from libc.stdint cimport uint8_t, uint64_t
1110
from libc.stdlib cimport malloc, free
1211
from libc.string cimport memcpy, memmove
1312

13+
from ..crypto.low_level import CSPRNG
14+
1415
from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
1516
from .reader import FileReader, Chunk
1617

@@ -40,14 +41,31 @@ cdef extern from *:
4041
@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)  # Deactivate negative indexing.
cdef uint64_t* buzhash64_init_table(bytes key):
    """
    Generate a balanced pseudo-random table deterministically from a 256-bit key.

    Balanced means that for each bit position 0..63, exactly 50% of the table values have the bit set to 1.

    The returned table holds 256 uint64_t entries and is allocated with malloc();
    the caller owns it and must release it with free().

    :param key: 32-byte key used to seed the deterministic random number generator
    :return: pointer to the newly allocated table
    :raises MemoryError: if the table cannot be allocated
    """
    cdef int i, j, bit_pos
    cdef uint64_t* table

    # Create the deterministic random number generator first: it validates the key,
    # so a bad key fails before anything has been allocated.
    rng = CSPRNG(key)

    table = <uint64_t*>malloc(256 * sizeof(uint64_t))
    if table == NULL:
        raise MemoryError("Failed to allocate buzhash64 table")

    try:
        # Initialize all values to 0
        for i in range(256):
            table[i] = 0

        # For each bit position, deterministically assign exactly 128 positions to have that bit set
        for bit_pos in range(64):
            # Create a list of indices and shuffle deterministically
            indices = list(range(256))
            rng.shuffle(indices)

            # Set the bit at bit_pos for the first 128 shuffled indices
            for i in range(128):
                j = indices[i]
                table[j] |= (1ULL << bit_pos)
    except BaseException:
        # Do not leak the table if the generator fails while we are filling it.
        free(table)
        raise

    return table
5270

5371

@@ -289,3 +307,14 @@ def buzhash64_update(uint64_t sum, unsigned char remove, unsigned char add, size
289307
sum = _buzhash64_update(sum, remove, add, len, table)
290308
free(table)
291309
return sum
310+
311+
312+
def buzhash64_get_table(bytes key):
    """
    Return the buzhash table generated from <key> as a list of 256 integers.

    The C table built by buzhash64_init_table() is copied into a Python list
    and freed again before returning.
    """
    cdef int idx
    cdef uint64_t *bh_table = buzhash64_init_table(key)
    values = []
    try:
        for idx in range(256):
            values.append(bh_table[idx])
    finally:
        # the C table is only a temporary, always release it
        free(bh_table)
    return values

src/borg/crypto/low_level.pyx

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ from math import ceil
4040

4141
from cpython cimport PyMem_Malloc, PyMem_Free
4242
from cpython.buffer cimport PyBUF_SIMPLE, PyObject_GetBuffer, PyBuffer_Release
43+
from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AsString
44+
from libc.stdlib cimport malloc, free
45+
from libc.stdint cimport uint8_t, uint32_t, uint64_t
46+
from libc.string cimport memset, memcpy
4347

4448
API_VERSION = '1.3_01'
4549

@@ -714,3 +718,161 @@ def blake2b_256(key, data):
714718

715719
def blake2b_128(data):
716720
return hashlib.blake2b(data, digest_size=16).digest()
721+
722+
723+
cdef class CSPRNG:
    """
    Cryptographically Secure Pseudo-Random Number Generator based on AES-CTR mode.

    This class provides methods for generating random bytes, bounded random
    integers and shuffling lists using a deterministic algorithm seeded with
    a 256-bit key.

    The random stream is produced by encrypting a zero-filled buffer with
    AES-256 in CTR mode, starting from an all-zero IV, so the output depends
    only on the seed key.
    """
    cdef EVP_CIPHER_CTX *ctx
    cdef uint8_t key[32]
    cdef uint8_t iv[16]
    cdef uint8_t zeros[4096]  # per-instance all-zero plaintext that gets encrypted
    cdef uint8_t buffer[4096]  # per-instance buffer of generated, not yet consumed bytes
    cdef size_t buffer_size
    cdef size_t buffer_pos

    def __cinit__(self, bytes seed_key):
        """
        Initialize the CSPRNG with a 256-bit key.

        :param seed_key: A 32-byte key used as the seed for the CSPRNG
        :raises ValueError: if seed_key is not exactly 32 bytes long
        :raises MemoryError: if the cipher context cannot be allocated
        :raises CryptoError: if the AES-CTR cipher cannot be initialized
        """
        if len(seed_key) != 32:
            raise ValueError("Seed key must be 32 bytes (256 bits)")

        # Initialize context
        self.ctx = EVP_CIPHER_CTX_new()
        if self.ctx == NULL:
            raise MemoryError("Failed to allocate cipher context")

        self.key = seed_key[:32]

        # Initialize to zeros
        memset(self.iv, 0, 16)
        memset(self.zeros, 0, 4096)

        self.buffer_size = 4096
        self.buffer_pos = self.buffer_size  # Force refill on first use

        # Initialize the cipher
        if not EVP_EncryptInit_ex(self.ctx, EVP_aes_256_ctr(), NULL, self.key, self.iv):
            # The object is still deallocated (and __dealloc__ run) when __cinit__
            # raises, so clear the pointer after freeing to avoid a double free.
            EVP_CIPHER_CTX_free(self.ctx)
            self.ctx = NULL
            raise CryptoError("Failed to initialize AES-CTR cipher")

    def __dealloc__(self):
        """Free resources when the object is deallocated."""
        if self.ctx != NULL:
            EVP_CIPHER_CTX_free(self.ctx)
            self.ctx = NULL

    cdef _refill_buffer(self):
        """
        Refill the internal buffer with random bytes.

        :raises CryptoError: if encryption fails or yields an unexpected amount of data
        """
        cdef int outlen = 0

        # Encrypt zeros to get random bytes
        if not EVP_EncryptUpdate(self.ctx, self.buffer, &outlen, self.zeros, self.buffer_size):
            raise CryptoError("Failed to generate random bytes")
        if outlen != self.buffer_size:
            raise CryptoError("Unexpected length of random bytes")

        self.buffer_pos = 0

    def random_bytes(self, size_t n):
        """
        Generate n random bytes.

        :param n: Number of bytes to generate
        :return: a bytes object containing the random bytes
        """
        # Directly create a Python bytes object of the required size
        cdef object py_bytes = PyBytes_FromStringAndSize(NULL, n)
        cdef uint8_t *result = <uint8_t *>PyBytes_AsString(py_bytes)
        cdef size_t remaining
        cdef size_t pos
        cdef size_t to_copy
        cdef size_t available

        remaining = n
        pos = 0

        while remaining > 0:
            if self.buffer_pos >= self.buffer_size:
                self._refill_buffer()

            # Calculate how many bytes we can copy
            available = self.buffer_size - self.buffer_pos
            to_copy = remaining if remaining < available else available

            # Copy bytes from buffer to result
            memcpy(result + pos, &self.buffer[self.buffer_pos], to_copy)

            self.buffer_pos += to_copy
            pos += to_copy
            remaining -= to_copy

        return py_bytes

    def random_int(self, n):
        """
        Generate a random integer in the range [0, n).

        :param n: Upper bound (exclusive)
        :return: Random integer
        :raises ValueError: if n is not positive
        """
        if n <= 0:
            raise ValueError("Upper bound must be positive")
        if n == 1:
            return 0

        # Calculate the number of bits and bytes needed
        bits_needed = 0
        temp = n - 1
        while temp > 0:
            bits_needed += 1
            temp >>= 1
        bytes_needed = (bits_needed + 7) // 8

        # Generate random bytes
        mask = (1 << bits_needed) - 1
        max_attempts = 1000  # Prevent infinite loop

        # Rejection sampling to avoid bias
        attempts = 0
        while attempts < max_attempts:
            attempts += 1
            random_data = self.random_bytes(bytes_needed)
            result = int.from_bytes(random_data, byteorder='big')

            # Apply mask to get the right number of bits
            result &= mask
            if result < n:
                return result

        # If we reach here, we've made too many attempts
        # Fall back to a slightly biased but guaranteed-to-terminate method
        random_data = self.random_bytes(bytes_needed)
        result = int.from_bytes(random_data, byteorder='big')
        return result % n

    def shuffle(self, list items):
        """
        Shuffle a list in-place using the Fisher-Yates algorithm.

        Lists with fewer than two elements are left untouched.

        :param items: List to shuffle
        """
        cdef size_t n = len(items)
        cdef size_t i, j

        # n is unsigned, so n - 1 would wrap around for an empty list;
        # there is nothing to shuffle for 0 or 1 elements anyway.
        if n < 2:
            return

        for i in range(n - 1, 0, -1):
            # Generate random index j such that 0 <= j <= i
            j = self.random_int(i + 1)

            # Swap items[i] and items[j]
            items[i], items[j] = items[j], items[i]

src/borg/testsuite/chunkers/buzhash64_self_test.py

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,63 +6,69 @@
66
from ...chunkers import get_chunker
77
from ...chunkers.buzhash64 import buzhash64, buzhash64_update, ChunkerBuzHash64
88
from ...constants import * # NOQA
9+
from ...helpers import hex_to_bin
910
from .. import BaseTestCase
1011
from . import cf
1112

13+
# from os.urandom(32)
14+
key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
15+
key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
16+
key2 = hex_to_bin("57174a65fde67fe127b18430525b50a58406f1bd6cc629535208c7832e181067")
17+
1218

1319
class ChunkerBuzHash64TestCase(BaseTestCase):
1420
def test_chunkify64(self):
    """Check chunk boundaries produced by ChunkerBuzHash64 for fixed keys and parameters."""

    def chunked(key, payload, *params):
        # run the chunker over an in-memory file and collect the chunks via cf()
        return cf(ChunkerBuzHash64(key, *params).chunkify(BytesIO(payload)))

    big = b"0" * int(1.5 * (1 << CHUNK_MAX_EXP)) + b"Y"
    pieces = chunked(key0, big, 1, CHUNK_MAX_EXP, 2, 2)
    self.assert_equal(len(pieces), 2)
    self.assert_equal(b"".join(pieces), big)
    self.assert_equal(chunked(key0, b"", 1, CHUNK_MAX_EXP, 2, 2), [])

    repeated = b"foobarboobaz" * 3
    cases = [
        (
            key0,
            (1, CHUNK_MAX_EXP, 2, 2),
            [b"foobarb", b"ooba", b"zf", b"oobarb", b"ooba", b"zf", b"oobarb", b"oobaz"],
        ),
        (
            key1,
            (1, CHUNK_MAX_EXP, 2, 2),
            [b"fo", b"oba", b"rb", b"oob", b"azf", b"ooba", b"rb", b"oob", b"azf", b"ooba", b"rb", b"oobaz"],
        ),
        (
            key2,
            (1, CHUNK_MAX_EXP, 2, 2),
            [b"foobar", b"booba", b"zfoobar", b"booba", b"zfoobar", b"boobaz"],
        ),
        (
            key0,
            (2, CHUNK_MAX_EXP, 2, 3),
            [b"foobarbo", b"obaz", b"foobarbo", b"obaz", b"foobarbo", b"obaz"],
        ),
        (key1, (2, CHUNK_MAX_EXP, 2, 3), [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"]),
        (key2, (2, CHUNK_MAX_EXP, 2, 3), [b"foob", b"arboobazfoob", b"arboobazfoob", b"arboobaz"]),
        (key0, (3, CHUNK_MAX_EXP, 2, 3), [b"foobarbo", b"obazfoobarbo", b"obazfoobarbo", b"obaz"]),
        (key1, (3, CHUNK_MAX_EXP, 2, 3), [b"foobarboob", b"azfoobarboob", b"azfoobarboobaz"]),
        (key2, (3, CHUNK_MAX_EXP, 2, 3), [b"foobarboobazfoob", b"arboobazfoob", b"arboobaz"]),
    ]
    for key, params, expected in cases:
        self.assert_equal(chunked(key, repeated, *params), expected)
5662

5763
def test_buzhash64(self):
    """Pin known buzhash64 values and check the rolling update against a full computation."""
    window = b"abcdefghijklmnop"
    self.assert_equal(buzhash64(window, key0), 17414563089559790077)
    self.assert_equal(buzhash64(window, key1), 1397285894609271345)

    # rolling "X" out and "p" in must yield the hash of the shifted window
    full_hash = buzhash64(window, key0)
    before_roll = buzhash64(b"Xabcdefghijklmno", key0)
    rolled = buzhash64_update(before_roll, ord("X"), ord("p"), 16, key0)
    self.assert_equal(rolled, full_hash)

    # Test with more than 63 bytes to make sure our barrel_shift macro works correctly
    long_input = b"abcdefghijklmnopqrstuvwxyz" * 4
    self.assert_equal(buzhash64(long_input, key0), 17683050804041322250)
6672

6773
def test_small_reads64(self):
6874
class SmallReadFile:

0 commit comments

Comments
 (0)