Skip to content

Commit c149894

Browse files
authored
Merge pull request #31 from saxbophone/josh/26-reject-non-unique-encoding-alphabets
Reject non unique encoding alphabets and `None` being used in them
2 parents 3a04f12 + 5ac9214 commit c149894

File tree

4 files changed

+167
-2
lines changed

4 files changed

+167
-2
lines changed

basest/core/decode.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
)
66

77
from .encode import encode_raw
8-
from .utils import ints_to_symbols, symbols_to_ints
8+
from .utils import ints_to_symbols, symbols_to_ints, validate_symbol_tables
99

1010

1111
def decode_raw(input_base, output_base, input_ratio, output_ratio, input_data):
@@ -54,6 +54,12 @@ def decode(
5454
Assumes standard base64-style padding using the given input padding symbol,
5555
but can handle unpadded input just fine.
5656
"""
57+
# validate both symbol tables and the padding symbol before continuing
58+
validate_symbol_tables(
59+
input_symbol_table,
60+
input_padding,
61+
output_symbol_table
62+
)
5763
# create workon copy of input data and convert symbols to raw ints
5864
# NOTE: input symbol table here includes the padding character
5965
input_workon = symbols_to_ints(

basest/core/encode.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
absolute_import, division, print_function, unicode_literals
55
)
66

7-
from .utils import ints_to_symbols, symbols_to_ints
7+
from .utils import ints_to_symbols, symbols_to_ints, validate_symbol_tables
88

99

1010
def _nearest_length(input_length, input_ratio):
@@ -96,6 +96,12 @@ def encode(
9696
Uses standard base64-style padding if needed, using the given padding
9797
symbol.
9898
"""
99+
# validate both symbol tables and the padding symbol before continuing
100+
validate_symbol_tables(
101+
output_symbol_table,
102+
output_padding,
103+
input_symbol_table
104+
)
99105
# create workon copy of input data and convert symbols to raw ints
100106
input_workon = symbols_to_ints(input_data, input_symbol_table)
101107
# use encode_raw() to encode the data

basest/core/utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,38 @@ def symbols_to_ints(symbols, symbol_table):
1919
convert them to an iterable of ints and return this.
2020
"""
2121
return [symbol_table.index(s) for s in symbols]
22+
23+
24+
def symbol_table_is_unique(symbol_table, padding_symbol=None):
    """
    Returns True if the given symbol table contains no repeated symbols and
    does not contain the padding symbol, otherwise returns False.

    symbol_table must support len() and contain hashable symbols (it is
    converted to a set to detect repeats).
    padding_symbol is optional: None (the default) means that no padding
    symbol is in use, in which case only the table itself is checked.
    """
    # simple way of checking if a collection of hashables is unique
    if len(symbol_table) != len(set(symbol_table)):
        return False
    # None is the 'no padding symbol' marker, so it must never be looked up
    # in the table - doing so would misreport a unique table which happens
    # to contain None as being non-unique
    if padding_symbol is None:
        return True
    # otherwise, check that padding_symbol isn't in the symbol table
    return padding_symbol not in symbol_table
36+
37+
38+
def validate_symbol_tables(symbol_table, padding_symbol, other_symbol_table):
    """
    Validates two symbol tables (the padding symbol being used alongside the
    first one).

    The tables do not have to be lists: each one is only iterated here, so
    tuples (or other iterables accepted by symbol_table_is_unique()) work too.

    Raises ValueError if either of the symbol tables (or padding symbol) fail
    validation, i.e. if any of them is or contains None, if either table
    contains repeated symbols, or if the padding symbol appears in the first
    symbol table.
    Returns nothing when validation passes.
    """
    # first check that they all do not contain None
    # NOTE: the tables are scanned one by one rather than being joined
    # together with `+`, which would raise TypeError for non-list tables
    tables = (symbol_table, other_symbol_table)
    if padding_symbol is None or any(
        symbol is None for table in tables for symbol in table
    ):
        raise ValueError(
            'None cannot be used in symbol tables nor for padding'
        )
    # if that check passes, validate tables (and padding) for uniqueness
    # the padding symbol is evaluated with the first symbol table
    if not (
        symbol_table_is_unique(symbol_table, padding_symbol) and
        symbol_table_is_unique(other_symbol_table)
    ):
        raise ValueError('Unique symbol tables required')

tests/core/test_encode_decode.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,65 @@ def test_encode(
127127

128128
self.assertEqual(output_data, expected_output_data)
129129

130+
@data(
    # output symbol table has repeated symbols
    (list('abcd'), list('ccccc'), '1'),
    # input symbol table has repeated symbols
    (list('!!!!!'), list('abcdef'), '#')
)
@unpack
def test_encode_rejects_non_unique_symbol_tables(
    self,
    input_symbol_table,
    output_symbol_table,
    padding_symbol
):
    """
    encode() must raise ValueError when either the input or the output
    symbol table it is given contains repeated symbols.
    """
    with self.assertRaises(ValueError):
        # positional arguments: each table is preceded by its length, then
        # the padding symbol, then `1, 1` and an empty input
        arguments = (
            len(input_symbol_table), input_symbol_table,
            len(output_symbol_table), output_symbol_table,
            padding_symbol,
            1, 1,
            [],
        )
        encode(*arguments)
153+
154+
def test_encode_rejects_output_symbol_table_containing_padding_symbol(
    self
):
    """
    encode() must raise ValueError when the padding symbol is also one of
    the symbols in the output symbol table.
    """
    output_symbol_table = ['b']
    # deliberately reuse a symbol from the output table as the padding
    padding_symbol = 'b'
    with self.assertRaises(ValueError):
        encode(1, ['a'], 1, output_symbol_table, padding_symbol, 1, 1, [])
163+
164+
@data(
    # None as the padding symbol
    (list('abcd'), list('efghijk'), None),
    # None inside the output symbol table
    (list('1234'), [1, 2, 3, None], '#'),
    # None inside the input symbol table
    ([None, 2, 3, 4], list('cabuges'), '#')
)
@unpack
def test_encode_rejects_none_used_in_symbol_tables_and_padding(
    self,
    input_symbol_table,
    output_symbol_table,
    padding_symbol
):
    """
    encode() must raise ValueError when the padding symbol is None, or when
    None appears in either of the symbol tables.
    """
    with self.assertRaises(ValueError):
        # positional arguments: each table is preceded by its length, then
        # the padding symbol, then `1, 1` and an empty input
        arguments = (
            len(input_symbol_table), input_symbol_table,
            len(output_symbol_table), output_symbol_table,
            padding_symbol,
            1, 1,
            [],
        )
        encode(*arguments)
188+
130189
@data(
131190
# Base-64, using most common alphabet - no padding
132191
(
@@ -222,6 +281,65 @@ def test_decode(
222281

223282
self.assertEqual(output_data, expected_output_data)
224283

284+
@data(
    # output symbol table has repeated symbols
    (list('abcd'), list('ccccc'), '1'),
    # input symbol table has repeated symbols
    (list('!!!!!'), list('abcdef'), '#')
)
@unpack
def test_decode_rejects_non_unique_symbol_tables(
    self,
    input_symbol_table,
    output_symbol_table,
    padding_symbol
):
    """
    decode() must raise ValueError when either the input or the output
    symbol table it is given contains repeated symbols.
    """
    with self.assertRaises(ValueError):
        # positional arguments: the input table (preceded by its length) and
        # the padding symbol, the output table (preceded by its length),
        # then `1, 1` and an empty input
        arguments = (
            len(input_symbol_table), input_symbol_table,
            padding_symbol,
            len(output_symbol_table), output_symbol_table,
            1, 1,
            [],
        )
        decode(*arguments)
307+
308+
def test_decode_rejects_input_symbol_table_containing_padding_symbol(
    self
):
    """
    decode() must raise ValueError when the padding symbol is also one of
    the symbols in the input symbol table.
    """
    input_symbol_table = ['a']
    # deliberately reuse a symbol from the input table as the padding
    padding_symbol = 'a'
    with self.assertRaises(ValueError):
        decode(1, input_symbol_table, padding_symbol, 1, ['b'], 1, 1, [])
317+
318+
@data(
    # None as the padding symbol
    (list('abcd'), list('efghijk'), None),
    # None inside the output symbol table
    (list('1234'), [1, 2, 3, None], '#'),
    # None inside the input symbol table
    ([None, 2, 3, 4], list('cabuges'), '#')
)
@unpack
def test_decode_rejects_none_used_in_symbol_tables_and_padding(
    self,
    input_symbol_table,
    output_symbol_table,
    padding_symbol
):
    """
    decode() must raise ValueError when the padding symbol is None, or when
    None appears in either of the symbol tables.
    """
    with self.assertRaises(ValueError):
        # positional arguments: the input table (preceded by its length) and
        # the padding symbol, the output table (preceded by its length),
        # then `1, 1` and an empty input
        arguments = (
            len(input_symbol_table), input_symbol_table,
            padding_symbol,
            len(output_symbol_table), output_symbol_table,
            1, 1,
            [],
        )
        decode(*arguments)
342+
225343
@data(
226344
# Base-64, using most common alphabet with no padding needed
227345
(

0 commit comments

Comments
 (0)