Skip to content

Commit ff51b84

Browse files
committed
feat: hashlib compatibility
- algorithms_available / algorithms_guaranteed: module attributes
- Str rejection: TypeError('Strings must be encoded before hashing')
- None rejection: TypeError('object supporting the buffer API required')
- data= keyword argument across all entry points (constructor, update, one-shot)
- Full buffer type support: bytes, bytearray, memoryview, array, mmap, PickleBuffer, ctypes
- tp_vectorcall on all 4 type constructors (CPython fast path)
- METH_FASTCALL on all 12 module-level one-shot functions
- METH_FASTCALL|METH_KEYWORDS on all 4 update() methods
- Manual arg parsing in tp_init for PyPy fallback
- Reject unknown keywords, duplicate args, extra positional args globally
- _get_buffer_or_str, _parse_fastcall_args, _check_kwargs shared helpers
- Py_ALWAYS_INLINE on all performance-critical helpers
- PyLong_FromUnsigned* replaces Py_BuildValue
- Remove hexdigits lookup table (regressed)
- 120 tests (15 hashlib compat, 32 fastcall, 34 benchmark, 39 original)
- Tested on CPython 3.9-3.15 and PyPy 3.9-3.11
1 parent b24c873 commit ff51b84

10 files changed

Lines changed: 747 additions & 324 deletions

src/_xxhash.c

Lines changed: 407 additions & 157 deletions
Large diffs are not rendered by default.

tests/test_benchmark.py

Lines changed: 5 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -90,42 +90,17 @@ def test_xxh64_hexdigest_5b():
9090
xxhash.xxh64_hexdigest(DATA_5B)
9191

9292

93-
# ── str input (tests _get_buffer_or_str UTF-8 encoding path) ────────
94-
95-
DATA_STR = "hello world"
96-
97-
98-
@pytest.mark.benchmark
99-
def test_xxh32_intdigest_str():
100-
xxhash.xxh32_intdigest(DATA_STR)
101-
102-
103-
@pytest.mark.benchmark
104-
def test_xxh64_intdigest_str():
105-
xxhash.xxh64_intdigest(DATA_STR)
106-
107-
108-
@pytest.mark.benchmark
109-
def test_xxh3_64_intdigest_str():
110-
xxhash.xxh3_64_intdigest(DATA_STR)
111-
112-
113-
@pytest.mark.benchmark
114-
def test_xxh3_128_intdigest_str():
115-
xxhash.xxh3_128_intdigest(DATA_STR)
116-
117-
11893
# ── type constructor (tests tp_vectorcall) ──────────────────────────
11994

12095

12196
@pytest.mark.benchmark
12297
def test_xxh32_ctor():
123-
xxhash.xxh32(DATA_STR)
98+
xxhash.xxh32(DATA_5B)
12499

125100

126101
@pytest.mark.benchmark
127102
def test_xxh32_ctor_seed():
128-
xxhash.xxh32(DATA_STR, seed=SEED_32)
103+
xxhash.xxh32(DATA_5B, seed=SEED_32)
129104

130105

131106
@pytest.mark.benchmark
@@ -135,17 +110,17 @@ def test_xxh32_ctor_empty():
135110

136111
@pytest.mark.benchmark
137112
def test_xxh64_ctor():
138-
xxhash.xxh64(DATA_STR, seed=SEED_64)
113+
xxhash.xxh64(DATA_5B, seed=SEED_64)
139114

140115

141116
@pytest.mark.benchmark
142117
def test_xxh3_64_ctor():
143-
xxhash.xxh3_64(DATA_STR, seed=SEED_64)
118+
xxhash.xxh3_64(DATA_5B, seed=SEED_64)
144119

145120

146121
@pytest.mark.benchmark
147122
def test_xxh3_128_ctor():
148-
xxhash.xxh3_128(DATA_STR, seed=SEED_64)
123+
xxhash.xxh3_128(DATA_5B, seed=SEED_64)
149124

150125

151126
# ── 2MB throughput: hashing dominates, call overhead negligible ─────

tests/test_fastcall.py

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,12 @@ def test_input_bytes(self):
3636
self._check(a, self.data)
3737

3838
def test_input_str(self):
39+
"""hashlib compatibility: str raises TypeError."""
3940
s = self.data.decode()
4041
for a in self.algorithms:
41-
self._check(a, s)
42+
for fn in self._funcs(a):
43+
with self.assertRaises(TypeError):
44+
fn(s)
4245

4346
def test_input_empty(self):
4447
for a in self.algorithms:
@@ -64,13 +67,13 @@ def test_positional_seed_xxh3_128(self):
6467

6568
# ── keyword input ─────────────────────────────────────────────
6669

67-
def test_keyword_input(self):
70+
def test_keyword_data(self):
6871
for a in self.algorithms:
69-
self._check(a, input=self.data)
72+
self._check(a, data=self.data)
7073

71-
def test_keyword_input_and_seed(self):
74+
def test_keyword_data_and_seed(self):
7275
for a in self.algorithms:
73-
self._check(a, input=self.data, seed=42)
76+
self._check(a, data=self.data, seed=42)
7477

7578
# ── keyword seed (with positional input) ──────────────────────
7679

@@ -105,6 +108,33 @@ def test_input_array(self):
105108
for a in self.algorithms:
106109
self._check(a, array.array('B', self.data))
107110

111+
def test_input_mmap(self):
112+
import mmap, tempfile, os
113+
with tempfile.NamedTemporaryFile(delete=False) as f:
114+
f.write(self.data)
115+
f.flush()
116+
try:
117+
with open(f.name, 'rb') as f2:
118+
with mmap.mmap(f2.fileno(), 0, access=mmap.ACCESS_READ) as m:
119+
for a in self.algorithms:
120+
self._check(a, m)
121+
finally:
122+
os.unlink(f.name)
123+
124+
def test_input_pickle_buffer(self):
125+
try:
126+
from pickle import PickleBuffer
127+
except ImportError:
128+
raise self.skipTest('PickleBuffer not available')
129+
for a in self.algorithms:
130+
self._check(a, PickleBuffer(self.data))
131+
132+
def test_input_ctypes(self):
133+
import ctypes
134+
buf = (ctypes.c_char * len(self.data)).from_buffer_copy(self.data)
135+
for a in self.algorithms:
136+
self._check(a, buf)
137+
108138

109139
class TestFastcallErrors(unittest.TestCase):
110140
"""Invalid argument passing: all error cases."""
@@ -141,16 +171,17 @@ def test_too_many_positional(self):
141171

142172
# ── unknown keyword ───────────────────────────────────────────
143173

144-
def test_unknown_keyword(self):
145-
self._assert_all_raise(TypeError, self.data, bad=1)
174+
def test_unknown_keyword_input(self):
175+
"""Old 'input' keyword is now unknown — was renamed to 'data'."""
176+
self._assert_all_raise(TypeError, input=self.data)
146177

147-
def test_unknown_keyword_input_kw(self):
148-
self._assert_all_raise(TypeError, input=self.data, bad=1)
178+
def test_unknown_keyword_data_kw(self):
179+
self._assert_all_raise(TypeError, data=self.data, bad=1)
149180

150181
# ── duplicate arguments ───────────────────────────────────────
151182

152183
def test_duplicate_input(self):
153-
self._assert_all_raise(TypeError, self.data, input=self.data)
184+
self._assert_all_raise(TypeError, self.data, data=self.data)
154185

155186
def test_duplicate_seed(self):
156187
self._assert_all_raise(TypeError, self.data, 0, seed=1)
@@ -164,15 +195,15 @@ def test_invalid_seed_keyword(self):
164195
self._assert_all_raise(TypeError, self.data, seed='bad')
165196

166197
def test_invalid_seed_with_input_kw(self):
167-
self._assert_all_raise(TypeError, input=self.data, seed='bad')
198+
self._assert_all_raise(TypeError, data=self.data, seed='bad')
168199

169200
# ── invalid input type (not str, not buffer) ──────────────────
170201

171202
def test_input_not_bytes_or_str(self):
172203
self._assert_all_raise(TypeError, 12345)
173204

174205
def test_input_not_bytes_or_str_kw(self):
175-
self._assert_all_raise(TypeError, input=12345)
206+
self._assert_all_raise(TypeError, data=12345)
176207

177208

178209
class TestFastcallSeedOverflow(unittest.TestCase):

tests/test_hashlib_compat.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
"""Tests for hashlib compatibility."""
2+
import unittest
3+
import xxhash
4+
5+
6+
class TestHashlibCompat(unittest.TestCase):
7+
"""Verify hashlib-compatible interface."""
8+
9+
data = b'hello world'
10+
11+
def test_algorithms_available(self):
12+
self.assertIsInstance(xxhash.algorithms_available, set)
13+
for a in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128', 'xxh128'):
14+
self.assertIn(a, xxhash.algorithms_available)
15+
16+
def test_algorithms_guaranteed(self):
17+
self.assertEqual(xxhash.algorithms_guaranteed, xxhash.algorithms_available)
18+
19+
# ── str rejection ──────────────────────────────────────────────
20+
21+
def test_str_rejected(self):
22+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
23+
for fn in (getattr(xxhash, f'{algo}_digest'),
24+
getattr(xxhash, f'{algo}_intdigest'),
25+
getattr(xxhash, f'{algo}_hexdigest')):
26+
# positional str
27+
with self.assertRaisesRegex(TypeError,
28+
'Strings must be encoded before hashing'):
29+
fn('hello')
30+
# keyword str
31+
with self.assertRaisesRegex(TypeError,
32+
'Strings must be encoded before hashing'):
33+
fn(data='hello')
34+
# None
35+
with self.assertRaisesRegex(TypeError,
36+
'object supporting the buffer API required'):
37+
fn(None)
38+
39+
def test_str_rejected_constructor(self):
40+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
41+
cls = getattr(xxhash, algo)
42+
# positional str
43+
with self.assertRaisesRegex(TypeError,
44+
'Strings must be encoded before hashing'):
45+
cls('hello')
46+
# keyword str
47+
with self.assertRaisesRegex(TypeError,
48+
'Strings must be encoded before hashing'):
49+
cls(data='hello')
50+
# None
51+
with self.assertRaisesRegex(TypeError,
52+
'object supporting the buffer API required'):
53+
cls(None)
54+
with self.assertRaisesRegex(TypeError,
55+
'object supporting the buffer API required'):
56+
cls(data=None)
57+
58+
def test_str_rejected_update(self):
59+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
60+
obj = getattr(xxhash, algo)()
61+
with self.assertRaisesRegex(TypeError,
62+
'Strings must be encoded before hashing'):
63+
obj.update('hello')
64+
# also test that bytes work after
65+
obj.update(b'hello')
66+
self.assertIsInstance(obj.intdigest(), int)
67+
# None
68+
with self.assertRaisesRegex(TypeError,
69+
'object supporting the buffer API required'):
70+
obj.update(None)
71+
with self.assertRaisesRegex(TypeError,
72+
'object supporting the buffer API required'):
73+
obj.update(data=None)
74+
75+
# ── unknown keyword ───────────────────────────────────────────
76+
77+
def test_unknown_keyword(self):
78+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
79+
cls = getattr(xxhash, algo)
80+
with self.assertRaises(TypeError):
81+
cls(b'hello', bad=1)
82+
with self.assertRaises(TypeError):
83+
cls(data=b'hello', bad=1)
84+
obj = cls()
85+
with self.assertRaises(TypeError):
86+
obj.update(b'hello', bad=1)
87+
with self.assertRaises(TypeError):
88+
obj.update(data=b'hello', bad=1)
89+
90+
# ── data keyword ───────────────────────────────────────────────
91+
92+
def test_data_keyword(self):
93+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
94+
obj = getattr(xxhash, algo)(self.data)
95+
d_fn = getattr(xxhash, f'{algo}_digest')
96+
i_fn = getattr(xxhash, f'{algo}_intdigest')
97+
h_fn = getattr(xxhash, f'{algo}_hexdigest')
98+
self.assertEqual(d_fn(data=self.data), obj.digest())
99+
self.assertEqual(i_fn(data=self.data), obj.intdigest())
100+
self.assertEqual(h_fn(data=self.data), obj.hexdigest())
101+
102+
def test_data_keyword_constructor(self):
103+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
104+
cls = getattr(xxhash, algo)
105+
obj = cls(data=self.data)
106+
self.assertEqual(obj.intdigest(),
107+
getattr(xxhash, f'{algo}_intdigest')(self.data))
108+
109+
# ── digest_size / block_size / name ────────────────────────────
110+
111+
def test_digest_size(self):
112+
self.assertEqual(xxhash.xxh32().digest_size, 4)
113+
self.assertEqual(xxhash.xxh64().digest_size, 8)
114+
self.assertEqual(xxhash.xxh3_64().digest_size, 8)
115+
self.assertEqual(xxhash.xxh3_128().digest_size, 16)
116+
117+
def test_block_size(self):
118+
self.assertEqual(xxhash.xxh32().block_size, 16)
119+
self.assertEqual(xxhash.xxh64().block_size, 32)
120+
self.assertEqual(xxhash.xxh3_64().block_size, 32)
121+
self.assertEqual(xxhash.xxh3_128().block_size, 64)
122+
123+
def test_name(self):
124+
self.assertEqual(xxhash.xxh32().name, 'XXH32')
125+
self.assertEqual(xxhash.xxh64().name, 'XXH64')
126+
self.assertEqual(xxhash.xxh3_64().name, 'XXH3_64')
127+
self.assertEqual(xxhash.xxh3_128().name, 'XXH3_128')
128+
129+
# ── digest / hexdigest ─────────────────────────────────────────
130+
131+
def test_digest(self):
132+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
133+
obj = getattr(xxhash, algo)(self.data)
134+
d_fn = getattr(xxhash, f'{algo}_digest')
135+
self.assertEqual(obj.digest(), d_fn(self.data))
136+
self.assertIsInstance(obj.digest(), bytes)
137+
self.assertEqual(len(obj.digest()), obj.digest_size)
138+
139+
def test_hexdigest(self):
140+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
141+
obj = getattr(xxhash, algo)(self.data)
142+
h_fn = getattr(xxhash, f'{algo}_hexdigest')
143+
self.assertEqual(obj.hexdigest(), h_fn(self.data))
144+
self.assertIsInstance(obj.hexdigest(), str)
145+
self.assertEqual(len(obj.hexdigest()), obj.digest_size * 2)
146+
147+
# ── update ─────────────────────────────────────────────────────
148+
149+
def test_update(self):
150+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
151+
a = getattr(xxhash, algo)()
152+
a.update(self.data)
153+
b = getattr(xxhash, algo)(self.data)
154+
self.assertEqual(a.digest(), b.digest())
155+
156+
# ── copy ───────────────────────────────────────────────────────
157+
158+
def test_copy(self):
159+
for algo in ('xxh32', 'xxh64', 'xxh3_64', 'xxh3_128'):
160+
a = getattr(xxhash, algo)(self.data)
161+
b = a.copy()
162+
self.assertEqual(a.digest(), b.digest())
163+
b.update(b'more')
164+
self.assertNotEqual(a.digest(), b.digest())

0 commit comments

Comments
 (0)