Skip to content

Commit db87d55

Browse files
committed
feat: hashlib compatibility
- algorithms_available / algorithms_guaranteed: module attributes
- Str rejection: TypeError('Strings must be encoded before hashing')
- data= keyword argument (renamed from input=)
- Buffer types: bytes, bytearray, memoryview, array, mmap, PickleBuffer, ctypes
- 13 hashlib compat tests + 11 buffer input tests
- 3.8.0.dev5
1 parent b24c873 commit db87d55

10 files changed

Lines changed: 271 additions & 161 deletions

src/_xxhash.c

Lines changed: 24 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -45,28 +45,18 @@
4545
#define XXH128_BLOCKSIZE 64
4646

4747

48-
/* Get a buffer from an object, or UTF-8 encode if it's a str.
49-
* On success, *owner is set to the object that owns the buffer
50-
* (NULL if the arg itself supports the buffer protocol).
51-
* Caller must PyBuffer_Release(buf) and Py_XDECREF(*owner). */
48+
/* Get a buffer from an object. Rejects str with hashlib-compatible error. */
5249
#ifndef Py_ALWAYS_INLINE
5350
# define Py_ALWAYS_INLINE
5451
#endif
5552

5653
static Py_ALWAYS_INLINE int
5754
_get_buffer_or_str(PyObject *obj, Py_buffer *buf, PyObject **owner)
5855
{
59-
/* Check str first to avoid a guaranteed-failing PyObject_GetBuffer call
60-
* and the resulting set/clear of a TypeError. */
6156
if (PyUnicode_Check(obj)) {
62-
*owner = PyUnicode_AsUTF8String(obj);
63-
if (*owner == NULL)
64-
return -1;
65-
if (PyObject_GetBuffer(*owner, buf, PyBUF_SIMPLE) < 0) {
66-
Py_DECREF(*owner);
67-
return -1;
68-
}
69-
return 0;
57+
PyErr_SetString(PyExc_TypeError,
58+
"Strings must be encoded before hashing");
59+
return -1;
7060
}
7161
if (PyObject_GetBuffer(obj, buf, PyBUF_SIMPLE) < 0)
7262
return -1;
@@ -75,7 +65,7 @@ _get_buffer_or_str(PyObject *obj, Py_buffer *buf, PyObject **owner)
7565
}
7666

7767
/* Parse input buffer and optional seed from fastcall arguments.
78-
* Handles: positional 'input', positional 'seed', keyword 'input',
68+
* Handles: positional 'data', positional 'seed', keyword 'data',
7969
* keyword 'seed', with proper error reporting for unknown keywords,
8070
* duplicate arguments, and too many positional args.
8171
* Returns 0 on success, -1 on error with exception set. */
@@ -120,10 +110,10 @@ _parse_fastcall_args(PyObject *const *args, Py_ssize_t nargs,
120110
PyObject *key = PyTuple_GET_ITEM(kwnames, i);
121111
PyObject *val = args[nargs + i];
122112

123-
if (PyUnicode_CompareWithASCIIString(key, "input") == 0) {
113+
if (PyUnicode_CompareWithASCIIString(key, "data") == 0) {
124114
if (input_found) {
125115
PyErr_Format(PyExc_TypeError,
126-
"%s() got multiple values for argument 'input'",
116+
"%s() got multiple values for argument 'data'",
127117
funcname);
128118
goto error;
129119
}
@@ -152,7 +142,7 @@ _parse_fastcall_args(PyObject *const *args, Py_ssize_t nargs,
152142

153143
if (!input_found && input_required) {
154144
PyErr_Format(PyExc_TypeError,
155-
"%s() missing required argument 'input'", funcname);
145+
"%s() missing required argument 'data'", funcname);
156146
return -1;
157147
}
158148
return 0;
@@ -559,7 +549,7 @@ static PyObject *PYXXH32_new(PyTypeObject *type, PyObject *args, PyObject *kwarg
559549
static int PYXXH32_init(PYXXH32Object *self, PyObject *args, PyObject *kwargs)
560550
{
561551
XXH32_hash_t seed = 0;
562-
char *keywords[] = {"input", "seed", NULL};
552+
char *keywords[] = {"data", "seed", NULL};
563553
Py_buffer buf;
564554

565555
buf.buf = buf.obj = NULL;
@@ -580,8 +570,8 @@ static int PYXXH32_init(PYXXH32Object *self, PyObject *args, PyObject *kwargs)
580570

581571
PyDoc_STRVAR(
582572
PYXXH32_update_doc,
583-
"update (input)\n\n"
584-
"Update the xxh32 object with the string input. Repeated calls are\n"
573+
"update (data)\n\n"
574+
"Update the xxh32 object with the string data. Repeated calls are\n"
585575
"equivalent to a single call with the concatenation of all the arguments.");
586576

587577
static PyObject *PYXXH32_update(PYXXH32Object *self, PyObject *args)
@@ -768,7 +758,7 @@ PyDoc_STRVAR(
768758
"\n"
769759
"Methods:\n"
770760
"\n"
771-
"update(input) -- updates the current digest with the provided string.\n"
761+
"update(data) -- updates the current digest with the provided data.\n"
772762
"digest() -- return the current digest value\n"
773763
"hexdigest() -- return the current digest as a string of hexadecimal digits\n"
774764
"intdigest() -- return the current digest as an integer\n"
@@ -915,7 +905,7 @@ static PyObject *PYXXH64_new(PyTypeObject *type, PyObject *args, PyObject *kwarg
915905
static int PYXXH64_init(PYXXH64Object *self, PyObject *args, PyObject *kwargs)
916906
{
917907
XXH64_hash_t seed = 0;
918-
char *keywords[] = {"input", "seed", NULL};
908+
char *keywords[] = {"data", "seed", NULL};
919909
Py_buffer buf;
920910

921911
buf.buf = buf.obj = NULL;
@@ -936,8 +926,8 @@ static int PYXXH64_init(PYXXH64Object *self, PyObject *args, PyObject *kwargs)
936926

937927
PyDoc_STRVAR(
938928
PYXXH64_update_doc,
939-
"update (input)\n\n"
940-
"Update the xxh64 object with the string input. Repeated calls are\n"
929+
"update (data)\n\n"
930+
"Update the xxh64 object with the string data. Repeated calls are\n"
941931
"equivalent to a single call with the concatenation of all the arguments.");
942932

943933
static PyObject *PYXXH64_update(PYXXH64Object *self, PyObject *args)
@@ -1124,7 +1114,7 @@ PyDoc_STRVAR(
11241114
"\n"
11251115
"Methods:\n"
11261116
"\n"
1127-
"update(input) -- updates the current digest with an additional string\n"
1117+
"update(data) -- updates the current digest with additional data\n"
11281118
"digest() -- return the current digest value\n"
11291119
"hexdigest() -- return the current digest as a string of hexadecimal digits\n"
11301120
"intdigest() -- return the current digest as an integer\n"
@@ -1270,7 +1260,7 @@ static PyObject *PYXXH3_64_new(PyTypeObject *type, PyObject *args, PyObject *kwa
12701260
static int PYXXH3_64_init(PYXXH3_64Object *self, PyObject *args, PyObject *kwargs)
12711261
{
12721262
XXH64_hash_t seed = 0;
1273-
char *keywords[] = {"input", "seed", NULL};
1263+
char *keywords[] = {"data", "seed", NULL};
12741264
Py_buffer buf;
12751265

12761266
buf.buf = buf.obj = NULL;
@@ -1291,8 +1281,8 @@ static int PYXXH3_64_init(PYXXH3_64Object *self, PyObject *args, PyObject *kwarg
12911281

12921282
PyDoc_STRVAR(
12931283
PYXXH3_64_update_doc,
1294-
"update (input)\n\n"
1295-
"Update the xxh3_64 object with the string input. Repeated calls are\n"
1284+
"update (data)\n\n"
1285+
"Update the xxh3_64 object with the string data. Repeated calls are\n"
12961286
"equivalent to a single call with the concatenation of all the arguments.");
12971287

12981288
static PyObject *PYXXH3_64_update(PYXXH3_64Object *self, PyObject *args)
@@ -1487,7 +1477,7 @@ PyDoc_STRVAR(
14871477
"\n"
14881478
"Methods:\n"
14891479
"\n"
1490-
"update(input) -- updates the current digest with an additional string\n"
1480+
"update(data) -- updates the current digest with additional data\n"
14911481
"digest() -- return the current digest value\n"
14921482
"hexdigest() -- return the current digest as a string of hexadecimal digits\n"
14931483
"intdigest() -- return the current digest as an integer\n"
@@ -1634,7 +1624,7 @@ static PyObject *PYXXH3_128_new(PyTypeObject *type, PyObject *args, PyObject *kw
16341624
static int PYXXH3_128_init(PYXXH3_128Object *self, PyObject *args, PyObject *kwargs)
16351625
{
16361626
XXH64_hash_t seed = 0;
1637-
char *keywords[] = {"input", "seed", NULL};
1627+
char *keywords[] = {"data", "seed", NULL};
16381628
Py_buffer buf;
16391629

16401630
buf.buf = buf.obj = NULL;
@@ -1655,8 +1645,8 @@ static int PYXXH3_128_init(PYXXH3_128Object *self, PyObject *args, PyObject *kwa
16551645

16561646
PyDoc_STRVAR(
16571647
PYXXH3_128_update_doc,
1658-
"update (input)\n\n"
1659-
"Update the xxh3_128 object with the string input. Repeated calls are\n"
1648+
"update (data)\n\n"
1649+
"Update the xxh3_128 object with the string data. Repeated calls are\n"
16601650
"equivalent to a single call with the concatenation of all the arguments.");
16611651

16621652
static PyObject *PYXXH3_128_update(PYXXH3_128Object *self, PyObject *args)
@@ -1868,7 +1858,7 @@ PyDoc_STRVAR(
18681858
"\n"
18691859
"Methods:\n"
18701860
"\n"
1871-
"update(input) -- updates the current digest with an additional string\n"
1861+
"update(data) -- updates the current digest with additional data\n"
18721862
"digest() -- return the current digest value\n"
18731863
"hexdigest() -- return the current digest as a string of hexadecimal digits\n"
18741864
"intdigest() -- return the current digest as an integer\n"

tests/test_benchmark.py

Lines changed: 5 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -90,42 +90,17 @@ def test_xxh64_hexdigest_5b():
9090
xxhash.xxh64_hexdigest(DATA_5B)
9191

9292

93-
# ── str input (tests _get_buffer_or_str UTF-8 encoding path) ────────
94-
95-
DATA_STR = "hello world"
96-
97-
98-
@pytest.mark.benchmark
99-
def test_xxh32_intdigest_str():
100-
xxhash.xxh32_intdigest(DATA_STR)
101-
102-
103-
@pytest.mark.benchmark
104-
def test_xxh64_intdigest_str():
105-
xxhash.xxh64_intdigest(DATA_STR)
106-
107-
108-
@pytest.mark.benchmark
109-
def test_xxh3_64_intdigest_str():
110-
xxhash.xxh3_64_intdigest(DATA_STR)
111-
112-
113-
@pytest.mark.benchmark
114-
def test_xxh3_128_intdigest_str():
115-
xxhash.xxh3_128_intdigest(DATA_STR)
116-
117-
11893
# ── type constructor (tests tp_vectorcall) ──────────────────────────
11994

12095

12196
@pytest.mark.benchmark
12297
def test_xxh32_ctor():
123-
xxhash.xxh32(DATA_STR)
98+
xxhash.xxh32(DATA_5B)
12499

125100

126101
@pytest.mark.benchmark
127102
def test_xxh32_ctor_seed():
128-
xxhash.xxh32(DATA_STR, seed=SEED_32)
103+
xxhash.xxh32(DATA_5B, seed=SEED_32)
129104

130105

131106
@pytest.mark.benchmark
@@ -135,17 +110,17 @@ def test_xxh32_ctor_empty():
135110

136111
@pytest.mark.benchmark
137112
def test_xxh64_ctor():
138-
xxhash.xxh64(DATA_STR, seed=SEED_64)
113+
xxhash.xxh64(DATA_5B, seed=SEED_64)
139114

140115

141116
@pytest.mark.benchmark
142117
def test_xxh3_64_ctor():
143-
xxhash.xxh3_64(DATA_STR, seed=SEED_64)
118+
xxhash.xxh3_64(DATA_5B, seed=SEED_64)
144119

145120

146121
@pytest.mark.benchmark
147122
def test_xxh3_128_ctor():
148-
xxhash.xxh3_128(DATA_STR, seed=SEED_64)
123+
xxhash.xxh3_128(DATA_5B, seed=SEED_64)
149124

150125

151126
# ── 2MB throughput: hashing dominates, call overhead negligible ─────

tests/test_fastcall.py

Lines changed: 43 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,12 @@ def test_input_bytes(self):
3636
self._check(a, self.data)
3737

3838
def test_input_str(self):
39+
"""hashlib compatibility: str raises TypeError."""
3940
s = self.data.decode()
4041
for a in self.algorithms:
41-
self._check(a, s)
42+
for fn in self._funcs(a):
43+
with self.assertRaises(TypeError):
44+
fn(s)
4245

4346
def test_input_empty(self):
4447
for a in self.algorithms:
@@ -64,13 +67,13 @@ def test_positional_seed_xxh3_128(self):
6467

6568
# ── keyword input ─────────────────────────────────────────────
6669

67-
def test_keyword_input(self):
70+
def test_keyword_data(self):
6871
for a in self.algorithms:
69-
self._check(a, input=self.data)
72+
self._check(a, data=self.data)
7073

71-
def test_keyword_input_and_seed(self):
74+
def test_keyword_data_and_seed(self):
7275
for a in self.algorithms:
73-
self._check(a, input=self.data, seed=42)
76+
self._check(a, data=self.data, seed=42)
7477

7578
# ── keyword seed (with positional input) ──────────────────────
7679

@@ -105,6 +108,33 @@ def test_input_array(self):
105108
for a in self.algorithms:
106109
self._check(a, array.array('B', self.data))
107110

111+
def test_input_mmap(self):
112+
import mmap, tempfile, os
113+
with tempfile.NamedTemporaryFile(delete=False) as f:
114+
f.write(self.data)
115+
f.flush()
116+
try:
117+
with open(f.name, 'rb') as f2:
118+
with mmap.mmap(f2.fileno(), 0, access=mmap.ACCESS_READ) as m:
119+
for a in self.algorithms:
120+
self._check(a, m)
121+
finally:
122+
os.unlink(f.name)
123+
124+
def test_input_pickle_buffer(self):
125+
try:
126+
from pickle import PickleBuffer
127+
except ImportError:
128+
raise self.skipTest('PickleBuffer not available')
129+
for a in self.algorithms:
130+
self._check(a, PickleBuffer(self.data))
131+
132+
def test_input_ctypes(self):
133+
import ctypes
134+
buf = (ctypes.c_char * len(self.data)).from_buffer_copy(self.data)
135+
for a in self.algorithms:
136+
self._check(a, buf)
137+
108138

109139
class TestFastcallErrors(unittest.TestCase):
110140
"""Invalid argument passing: all error cases."""
@@ -141,16 +171,17 @@ def test_too_many_positional(self):
141171

142172
# ── unknown keyword ───────────────────────────────────────────
143173

144-
def test_unknown_keyword(self):
145-
self._assert_all_raise(TypeError, self.data, bad=1)
174+
def test_unknown_keyword_input(self):
175+
"""Old 'input' keyword is now unknown — was renamed to 'data'."""
176+
self._assert_all_raise(TypeError, input=self.data)
146177

147-
def test_unknown_keyword_input_kw(self):
148-
self._assert_all_raise(TypeError, input=self.data, bad=1)
178+
def test_unknown_keyword_data_kw(self):
179+
self._assert_all_raise(TypeError, data=self.data, bad=1)
149180

150181
# ── duplicate arguments ───────────────────────────────────────
151182

152183
def test_duplicate_input(self):
153-
self._assert_all_raise(TypeError, self.data, input=self.data)
184+
self._assert_all_raise(TypeError, self.data, data=self.data)
154185

155186
def test_duplicate_seed(self):
156187
self._assert_all_raise(TypeError, self.data, 0, seed=1)
@@ -164,15 +195,15 @@ def test_invalid_seed_keyword(self):
164195
self._assert_all_raise(TypeError, self.data, seed='bad')
165196

166197
def test_invalid_seed_with_input_kw(self):
167-
self._assert_all_raise(TypeError, input=self.data, seed='bad')
198+
self._assert_all_raise(TypeError, data=self.data, seed='bad')
168199

169200
# ── invalid input type (not str, not buffer) ──────────────────
170201

171202
def test_input_not_bytes_or_str(self):
172203
self._assert_all_raise(TypeError, 12345)
173204

174205
def test_input_not_bytes_or_str_kw(self):
175-
self._assert_all_raise(TypeError, input=12345)
206+
self._assert_all_raise(TypeError, data=12345)
176207

177208

178209
class TestFastcallSeedOverflow(unittest.TestCase):

0 commit comments

Comments
 (0)