Skip to content

Commit 0483362

Browse files
committed
refactor: the num bytes per token is now a power of two
1 parent e3e9d67 commit 0483362

2 files changed

Lines changed: 12 additions & 2 deletions

File tree

src/modalities/dataloader/create_packed_data.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,17 @@ def _get_required_num_of_bytes_to_repr(int_to_get_repr: int) -> int:
8585
Returns:
8686
int: The number of bytes required to represent the integer.
8787
"""
88-
return math.ceil(math.log2(int_to_get_repr) / 8)
88+
# we currently only support token sizes of 1, 2 and 4 bytes, as implemented here:
89+
# https://github.com/Modalities/modalities/blob/fix_char_bytes_indexation_mismatch/src/modalities/dataloader/dataset.py#L202
90+
num_bytes = math.ceil(math.log2(int_to_get_repr) / 8)
91+
if num_bytes == 1:
92+
return 1
93+
elif num_bytes == 2:
94+
return 2
95+
elif num_bytes <= 4:
96+
return 4
97+
else:
98+
raise ValueError("Currently only support token byte sizes of 1, 2, and 4.")
8999

90100
def _encoded_token_to_bytes(self, encoded_token: int) -> bytes:
91101
"""

tests/dataloader/test_packed_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def test_continuously_packed_index(token_size_in_bytes: int, block_size: int, to
263263

264264
@pytest.mark.parametrize(
265265
"vocab_size, expected_num_bytes",
266-
[(254, 1), (255, 1), (256, 1), (257, 2), (65534, 2), (65535, 2), (65536, 2), (65537, 3)],
266+
[(254, 1), (255, 1), (256, 1), (257, 2), (65534, 2), (65535, 2), (65536, 2), (65537, 4), (65538, 4), (10000000, 4)],
267267
)
268268
def test__get_required_num_of_bytes_to_repr(vocab_size: int, expected_num_bytes: int):
269269
num_bytes = PackedDataGenerator._get_required_num_of_bytes_to_repr(int_to_get_repr=vocab_size)

0 commit comments

Comments
 (0)