Skip to content

Commit 0483362

Browse files
committed
refactor: the num bytes per token is now a power of two
1 parent e3e9d67 commit 0483362

2 files changed

Lines changed: 12 additions & 2 deletions

File tree

src/modalities/dataloader/create_packed_data.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,17 @@ def _get_required_num_of_bytes_to_repr(int_to_get_repr: int) -> int:
8585
Returns:
8686
int: The number of bytes required to represent the integer.
8787
"""
88-
return math.ceil(math.log2(int_to_get_repr) / 8)
88+
# we currently only support token sizes of 1, 2 and 4 bytes, as implemented here:
89+
# https://github.com/Modalities/modalities/blob/fix_char_bytes_indexation_mismatch/src/modalities/dataloader/dataset.py#L202
90+
num_bytes = math.ceil(math.log2(int_to_get_repr) / 8)
91+
if num_bytes == 1:
92+
return 1
93+
elif num_bytes == 2:
94+
return 2
95+
elif num_bytes <= 4:
96+
return 4
97+
else:
98+
raise ValueError("Currently only support token byte sizes of 1, 2, and 4.")
8999

90100
def _encoded_token_to_bytes(self, encoded_token: int) -> bytes:
91101
"""

tests/dataloader/test_packed_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def test_continuously_packed_index(token_size_in_bytes: int, block_size: int, to
263263

264264
@pytest.mark.parametrize(
265265
"vocab_size, expected_num_bytes",
266-
[(254, 1), (255, 1), (256, 1), (257, 2), (65534, 2), (65535, 2), (65536, 2), (65537, 3)],
266+
[(254, 1), (255, 1), (256, 1), (257, 2), (65534, 2), (65535, 2), (65536, 2), (65537, 4), (65538, 4), (10000000, 4)],
267267
)
268268
def test__get_required_num_of_bytes_to_repr(vocab_size: int, expected_num_bytes: int):
269269
num_bytes = PackedDataGenerator._get_required_num_of_bytes_to_repr(int_to_get_repr=vocab_size)

0 commit comments

Comments
 (0)