Skip to content

Commit 80c08fc

Browse files
authored
Merge pull request #5 from nebkat/feat/byte-array
feat: Binary optimized arrays (draft3)
2 parents 2c84f15 + 670b0fb commit 80c08fc

11 files changed

Lines changed: 117 additions & 71 deletions

File tree

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,8 @@ an extended Universal Binary JSON (UBJSON) Specification Draft-12 by adding
142142
the below new features:
143143

144144
* BJData adds 4 new numeric data types: `uint16 [u]`, `uint32 [m]`, `uint64 [M]` and `float16 [h]`
145+
* BJData adds a dedicated byte data type used in optimized array containers for binary data
145146
* BJData supports an optimized ND array container
146147
* BJData does not convert NaN/Inf/-Inf to `null`
147148
* BJData uses little-endian as the default byte order for integer/floating-point numbers, while UBJSON uses big-endian
148-
* BJData only permits non-zero-fixed-length data types (`UiuImlMLhdDC`) in strongly-typed array/object containers
149+
* BJData only permits non-zero-fixed-length data types (`UiuImlMLhdDCB`) in strongly-typed array/object containers

bjdata/decoder.py

Lines changed: 43 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -22,20 +22,21 @@
2222
from functools import reduce
2323

2424
from .compat import raise_from, intern_unicode
25-
from .markers import (TYPE_NONE, TYPE_NULL, TYPE_NOOP, TYPE_BOOL_TRUE, TYPE_BOOL_FALSE, TYPE_INT8, TYPE_UINT8,
25+
from .markers import (TYPE_NONE, TYPE_NULL, TYPE_NOOP, TYPE_BOOL_TRUE, TYPE_BOOL_FALSE, TYPE_BYTE, TYPE_INT8, TYPE_UINT8,
2626
TYPE_INT16, TYPE_INT32, TYPE_INT64, TYPE_FLOAT32, TYPE_FLOAT64, TYPE_HIGH_PREC, TYPE_CHAR,
2727
TYPE_UINT16, TYPE_UINT32, TYPE_UINT64, TYPE_FLOAT16,
2828
TYPE_STRING, OBJECT_START, OBJECT_END, ARRAY_START, ARRAY_END, CONTAINER_TYPE, CONTAINER_COUNT)
2929
from numpy import array as ndarray, dtype as npdtype, frombuffer as buffer2numpy, half as halfprec
3030
from array import array as typedarray
3131

32-
__TYPES = frozenset((TYPE_NULL, TYPE_BOOL_TRUE, TYPE_BOOL_FALSE, TYPE_INT8, TYPE_UINT8, TYPE_INT16, TYPE_INT32,
33-
TYPE_INT64, TYPE_FLOAT32, TYPE_FLOAT64, TYPE_UINT16, TYPE_UINT32, TYPE_UINT64, TYPE_FLOAT16,
34-
TYPE_HIGH_PREC, TYPE_CHAR, TYPE_STRING, ARRAY_START, OBJECT_START))
32+
__TYPES = frozenset((TYPE_NULL, TYPE_BOOL_TRUE, TYPE_BOOL_FALSE, TYPE_BYTE, TYPE_INT8, TYPE_UINT8, TYPE_INT16,
33+
TYPE_INT32, TYPE_INT64, TYPE_FLOAT32, TYPE_FLOAT64, TYPE_UINT16, TYPE_UINT32, TYPE_UINT64,
34+
TYPE_FLOAT16, TYPE_HIGH_PREC, TYPE_CHAR, TYPE_STRING, ARRAY_START, OBJECT_START))
3535
__TYPES_NO_DATA = frozenset((TYPE_NULL, TYPE_BOOL_FALSE, TYPE_BOOL_TRUE))
36-
__TYPES_INT = frozenset((TYPE_INT8, TYPE_UINT8, TYPE_INT16, TYPE_INT32, TYPE_INT64, TYPE_UINT16, TYPE_UINT32, TYPE_UINT64))
37-
__TYPES_FIXLEN = frozenset((TYPE_INT8, TYPE_UINT8, TYPE_INT16, TYPE_INT32, TYPE_INT64, TYPE_UINT16, TYPE_UINT32, TYPE_UINT64,
38-
TYPE_FLOAT16, TYPE_FLOAT32, TYPE_FLOAT64, TYPE_CHAR))
36+
__TYPES_INT = frozenset((TYPE_BYTE, TYPE_INT8, TYPE_UINT8, TYPE_INT16, TYPE_INT32, TYPE_INT64, TYPE_UINT16,
37+
TYPE_UINT32, TYPE_UINT64))
38+
__TYPES_FIXLEN = frozenset((TYPE_BYTE, TYPE_INT8, TYPE_UINT8, TYPE_INT16, TYPE_INT32, TYPE_INT64, TYPE_UINT16, TYPE_UINT32,
39+
TYPE_UINT64, TYPE_FLOAT16, TYPE_FLOAT32, TYPE_FLOAT64, TYPE_CHAR))
3940

4041
__SMALL_INTS_DECODED = [{pack('>b', i): i for i in range(-128, 128)}, {pack('<b', i): i for i in range(-128, 128)}]
4142
__SMALL_UINTS_DECODED = [{pack('>B', i): i for i in range(256)}, {pack('<B', i): i for i in range(256)}]
@@ -49,7 +50,8 @@
4950
__UNPACK_FLOAT32 = [Struct('>f').unpack, Struct('<f').unpack]
5051
__UNPACK_FLOAT64 = [Struct('>d').unpack, Struct('<d').unpack]
5152

52-
__DTYPE_MAP = { TYPE_INT8: 'b',
53+
__DTYPE_MAP = { TYPE_BYTE: 'B',
54+
TYPE_INT8: 'b',
5355
TYPE_UINT8: 'B',
5456
TYPE_INT16: 'h',
5557
TYPE_UINT16: 'H',
@@ -62,7 +64,8 @@
6264
TYPE_FLOAT64: 'd',
6365
TYPE_CHAR: 'c'}
6466

65-
__DTYPELEN_MAP={ TYPE_INT8: 1,
67+
__DTYPELEN_MAP={ TYPE_BYTE: 1,
68+
TYPE_INT8: 1,
6669
TYPE_UINT8: 1,
6770
TYPE_INT16: 2,
6871
TYPE_UINT16: 2,
@@ -114,6 +117,11 @@ def __decode_int_non_negative(fp_read, marker, le=1):
114117
raise DecoderException('Negative count/length unexpected')
115118
return value
116119

120+
def __decode_byte(fp_read, marker, le=1):
121+
try:
122+
return __SMALL_UINTS_DECODED[le][fp_read(1)]
123+
except KeyError as ex:
124+
raise_from(DecoderException('Failed to unpack byte'), ex)
117125

118126
def __decode_int8(fp_read, marker, le=1):
119127
try:
@@ -227,6 +235,7 @@ def __decode_object_key(fp_read, marker, intern_object_keys, le=1):
227235
__METHOD_MAP = {TYPE_NULL: (lambda _, __, ___: None),
228236
TYPE_BOOL_TRUE: (lambda _, __, ___: True),
229237
TYPE_BOOL_FALSE: (lambda _, __, ___: False),
238+
TYPE_BYTE: __decode_byte,
230239
TYPE_INT8: __decode_int8,
231240
TYPE_UINT8: __decode_uint8,
232241
TYPE_INT16: __decode_int16,
@@ -248,7 +257,7 @@ def prodlist(mylist):
248257
result = result * x
249258
return result
250259

251-
def __get_container_params(fp_read, in_mapping, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle):
260+
def __get_container_params(fp_read, in_mapping, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle):
252261
marker = fp_read(1)
253262
dims = []
254263
if marker == CONTAINER_TYPE:
@@ -262,15 +271,15 @@ def __get_container_params(fp_read, in_mapping, no_bytes, object_hook, object_pa
262271
if marker == CONTAINER_COUNT:
263272
marker = fp_read(1)
264273
if marker == ARRAY_START:
265-
dims = __decode_array(fp_read, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
274+
dims = __decode_array(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
266275
count = prodlist(dims)
267276
else:
268277
count = __decode_int_non_negative(fp_read, marker, islittle)
269278
counting = True
270279

271280
# special cases (no data (None or bool) / bytes array) will be handled in calling functions
272281
if not (type_ in __TYPES_NO_DATA or
273-
(type_ == TYPE_UINT8 and not in_mapping and not no_bytes)):
282+
(type_ == (TYPE_UINT8 if uint8_bytes else TYPE_BYTE) and not in_mapping and not no_bytes)):
274283
# Reading ahead is just to capture type, which will not exist if type is fixed
275284
marker = fp_read(1) if (in_mapping or type_ == TYPE_NONE) else type_
276285

@@ -283,9 +292,9 @@ def __get_container_params(fp_read, in_mapping, no_bytes, object_hook, object_pa
283292
return marker, counting, count, type_, dims
284293

285294

286-
def __decode_object(fp_read, no_bytes, object_hook, object_pairs_hook, # pylint: disable=too-many-branches
295+
def __decode_object(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, # pylint: disable=too-many-branches
287296
intern_object_keys, islittle):
288-
marker, counting, count, type_, dims = __get_container_params(fp_read, True, no_bytes,object_hook, object_pairs_hook,intern_object_keys, islittle)
297+
marker, counting, count, type_, dims = __get_container_params(fp_read, True, no_bytes, uint8_bytes, object_hook, object_pairs_hook,intern_object_keys, islittle)
289298
has_pairs_hook = object_pairs_hook is not None
290299
obj = [] if has_pairs_hook else {}
291300

@@ -323,9 +332,9 @@ def __decode_object(fp_read, no_bytes, object_hook, object_pairs_hook, # pylint
323332
# handled outside the except block above (on KeyError) to avoid an unfriendly "exception within except" backtrace
324333
if not handled:
325334
if marker == ARRAY_START:
326-
value = __decode_array(fp_read, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
335+
value = __decode_array(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
327336
elif marker == OBJECT_START:
328-
value = __decode_object(fp_read, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
337+
value = __decode_object(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
329338
else:
330339
raise DecoderException('Invalid marker within object')
331340

@@ -341,15 +350,15 @@ def __decode_object(fp_read, no_bytes, object_hook, object_pairs_hook, # pylint
341350
return object_pairs_hook(obj) if has_pairs_hook else object_hook(obj)
342351

343352

344-
def __decode_array(fp_read, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle):
345-
marker, counting, count, type_, dims = __get_container_params(fp_read, False, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
353+
def __decode_array(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle):
354+
marker, counting, count, type_, dims = __get_container_params(fp_read, False, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
346355

347356
# special case - no data (None or bool)
348357
if type_ in __TYPES_NO_DATA:
349358
return [__METHOD_MAP[type_](fp_read, type_, islittle)] * count
350359

351360
# special case - bytes array
352-
if type_ == TYPE_UINT8 and not no_bytes and len(dims)==0:
361+
if type_ == (TYPE_UINT8 if uint8_bytes else TYPE_BYTE) and not no_bytes and len(dims)==0:
353362
container = fp_read(count)
354363
if len(container) < count:
355364
raise DecoderException('Container bytes array too short')
@@ -388,9 +397,9 @@ def __decode_array(fp_read, no_bytes, object_hook, object_pairs_hook, intern_obj
388397
# handled outside the except block above (on KeyError) to avoid an unfriendly "exception within except" backtrace
389398
if not handled:
390399
if marker == ARRAY_START:
391-
value = __decode_array(fp_read, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
400+
value = __decode_array(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
392401
elif marker == OBJECT_START:
393-
value = __decode_object(fp_read, no_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
402+
value = __decode_object(fp_read, no_bytes, uint8_bytes, object_hook, object_pairs_hook, intern_object_keys, islittle)
394403
else:
395404
raise DecoderException('Invalid marker within array')
396405

@@ -411,14 +420,18 @@ def __object_hook_noop(obj):
411420
return obj
412421

413422

414-
def load(fp, no_bytes=False, object_hook=None, object_pairs_hook=None, intern_object_keys=False, islittle=True):
423+
def load(fp, no_bytes=False, uint8_bytes=False, object_hook=None, object_pairs_hook=None, intern_object_keys=False, islittle=True):
415424
"""Decodes and returns BJData/UBJSON from the given file-like object
416425
417426
Args:
418427
fp: read([size])-able object
419-
no_bytes (bool): If set, typed UBJSON arrays (uint8) will not be
428+
no_bytes (bool): If set, typed UBJSON arrays (byte) will not be
420429
converted to a bytes instance and instead treated like
421430
any other array (i.e. result in a list).
431+
uint8_bytes (bool): If set, typed UBJSON arrays (uint8) will be
432+
converted to a bytes instance instead of being
433+
treated as an array (for UBJSON & BJData Draft 2).
434+
Ignored if no_bytes is set.
422435
object_hook (callable): Called with the result of any object literal
423436
decoded (instead of dict).
424437
object_pairs_hook (callable): Called with the result of any object
@@ -432,7 +445,7 @@ def load(fp, no_bytes=False, object_hook=None, object_pairs_hook=None, intern_ob
432445
to unicode) and will be ignored.
433446
islittle (1 or 0): default is 1 for little-endian for all numerics (for
434447
BJData Draft 2), change to 0 to use big-endian
435-
(for UBJSON for BJData Draft 1)
448+
(for UBJSON & BJData Draft 1)
436449
437450
Returns:
438451
Decoded object
@@ -454,13 +467,13 @@ def load(fp, no_bytes=False, object_hook=None, object_pairs_hook=None, intern_ob
454467
| | (2) unicode |
455468
+----------------------------------+---------------+
456469
| uint8, int8, int16, int32, int64 | (3) int |
457-
| | (2) int, long |
470+
| byte | (2) int, long |
458471
+----------------------------------+---------------+
459472
| float32, float64 | float |
460473
+----------------------------------+---------------+
461474
| high_precision | Decimal |
462475
+----------------------------------+---------------+
463-
| array (typed, uint8) | (3) bytes |
476+
| array (typed, byte) | (3) bytes |
464477
| | (2) str |
465478
+----------------------------------+---------------+
466479
| true | True |
@@ -489,9 +502,9 @@ def load(fp, no_bytes=False, object_hook=None, object_pairs_hook=None, intern_ob
489502
except KeyError:
490503
pass
491504
if marker == ARRAY_START:
492-
newobj.append(__decode_array(fp_read, bool(no_bytes), object_hook, object_pairs_hook, intern_object_keys, islittle))
505+
newobj.append(__decode_array(fp_read, bool(no_bytes), bool(uint8_bytes), object_hook, object_pairs_hook, intern_object_keys, islittle))
493506
if marker == OBJECT_START:
494-
newobj.append(__decode_object(fp_read, bool(no_bytes), object_hook, object_pairs_hook, intern_object_keys, islittle))
507+
newobj.append(__decode_object(fp_read, bool(no_bytes), bool(uint8_bytes), object_hook, object_pairs_hook, intern_object_keys, islittle))
495508
raise DecoderException('Invalid marker')
496509
except DecoderException as ex:
497510
if len(newobj)>0:
@@ -505,9 +518,9 @@ def load(fp, no_bytes=False, object_hook=None, object_pairs_hook=None, intern_ob
505518

506519
return newobj;
507520

508-
def loadb(chars, no_bytes=False, object_hook=None, object_pairs_hook=None, intern_object_keys=False, islittle=True):
521+
def loadb(chars, no_bytes=False, uint8_bytes=False, object_hook=None, object_pairs_hook=None, intern_object_keys=False, islittle=True):
509522
"""Decodes and returns BJData/UBJSON from the given bytes or bytearray object. See
510523
load() for available arguments."""
511524
with BytesIO(chars) as fp:
512-
return load(fp, no_bytes=no_bytes, object_hook=object_hook, object_pairs_hook=object_pairs_hook,
525+
return load(fp, no_bytes=no_bytes, uint8_bytes=uint8_bytes, object_hook=object_hook, object_pairs_hook=object_pairs_hook,
513526
intern_object_keys=intern_object_keys, islittle=islittle)

0 commit comments

Comments
 (0)