pre_tokenization_pattern = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
num_processes = 4

-def process_chunk(start: int, end: int, input_path: str, special_tokens: list[str]) -> dict:
-    words_counts = defaultdict(int)
-    with open(input_path, "rb") as f:
-        f.seek(start)
-        chunk = f.read(end - start)
-
-    sentences = re.split(b"|".join(re.escape(token).encode("utf-8") for token in special_tokens), chunk)
-
-    words_counts = defaultdict(int)
-    for sentence in sentences:
-        matches = re.finditer(pre_tokenization_pattern, sentence.decode('utf-8', errors='replace'))
-        for match in matches:
-            word_bytes = tuple(match.group().encode("utf-8"))
-            words_counts[word_bytes] += 1
-
-    return dict(words_counts)
-
# return vocab and merges
def train_bpe(
    input_path: str,
@@ -40,17 +23,12 @@ def train_bpe(
        vocab[next_index] = token.encode("utf-8")
        next_index += 1

-    merges: list[tuple[bytes, bytes]] = []
-
    # --------------------------------
    # | Parallelizing Pre tokenization |
    # --------------------------------
    parallel_pre_tokenziation_start = time.time()
-    all_boundaries = set([0])
    with open(input_path, "rb") as f:
-        for special_token in special_tokens:
-            all_boundaries.update(find_chunk_boundaries(f, num_processes, special_token.encode("utf-8")))
-    boundaries = sorted(all_boundaries)
+        boundaries = find_chunk_boundaries(f, num_processes, "<|endoftext|>".encode("utf-8"))

    chunk_pairs = list(zip(boundaries[:-1], boundaries[1:]))

@@ -90,55 +68,52 @@ def train_bpe(
    #             words_counts[word_bytes] += 1
    # print(f"Normal Pre Tokenization cost: {time.time() - normal_pre_tokenziation_start}")

-    # next_index = len(vocab)
+    merges: list[tuple[bytes, bytes]] = []

    # -----------------
    # | Optimized Merge |
    # -----------------
    merge_start = time.time()
    pair_counts = defaultdict(int)
+    pair_to_word_bytes = defaultdict(set)  # inverted index: pair -> set of words that currently contain it
    for word_bytes, count in words_counts.items():
        pairs = list(zip(word_bytes[:-1], word_bytes[1:]))
        for pair in pairs:
            pair_counts[pair] += count
+            pair_to_word_bytes[pair].add(word_bytes)
    while next_index < vocab_size:
        max_pair = max(pair_counts, key=lambda x: (pair_counts[x], (vocab[x[0]], vocab[x[1]])))
        index1, index2 = max_pair
        merges.append((vocab[index1], vocab[index2]))
        vocab[next_index] = vocab[index1] + vocab[index2]
-
-        new_words_counts = defaultdict(int)
-        new_pair_counts = defaultdict(int, pair_counts)
-        for word_bytes, count in words_counts.items():
-            old_pairs = list(zip(word_bytes[:-1], word_bytes[1:]))
-
-            if max_pair not in old_pairs:
-                new_words_counts[word_bytes] += count
-                continue
-
+        affected_word_bytes = pair_to_word_bytes[max_pair].copy()  # only words that contain max_pair need updating
+
+        for affected in affected_word_bytes:
+            count = words_counts[affected]
            new_word_bytes = []
            i = 0
-            while i < len(word_bytes):
-                if i < len(word_bytes) - 1 and word_bytes[i] == index1 and word_bytes[i + 1] == index2:
+            while i < len(affected):
+                if i < len(affected) - 1 and affected[i] == index1 and affected[i + 1] == index2:
                    new_word_bytes.append(next_index)
                    i += 2
                else:
-                    new_word_bytes.append(word_bytes[i])
+                    new_word_bytes.append(affected[i])
                    i += 1
            new_word = tuple(new_word_bytes)
-            new_words_counts[new_word] += count
+            words_counts[new_word] += count

-            for pair in old_pairs:
-                new_pair_counts[pair] -= count
-                if new_pair_counts[pair] <= 0:
-                    del new_pair_counts[pair]
+            for pair in zip(affected[:-1], affected[1:]):
+                pair_counts[pair] -= count
+                pair_to_word_bytes[pair].discard(affected)
+                if pair_counts[pair] <= 0:
+                    del pair_counts[pair]
+                    del pair_to_word_bytes[pair]

            new_pairs = list(zip(new_word[:-1], new_word[1:]))
            for pair in new_pairs:
-                new_pair_counts[pair] += count
+                pair_counts[pair] += count
+                pair_to_word_bytes[pair].add(new_word)

-        words_counts = new_words_counts
-        pair_counts = new_pair_counts
        next_index += 1
    print(f"Merge cost: {time.time() - merge_start}")

@@ -174,3 +149,20 @@ def train_bpe(
    # next_index += 1
    # print(f"Merge cost: {time.time() - merge_start}")
    return vocab, merges
+
+def process_chunk(start: int, end: int, input_path: str, special_tokens: list[str]) -> dict:
+    words_counts = defaultdict(int)
+    with open(input_path, "rb") as f:
+        f.seek(start)
+        chunk = f.read(end - start)
+
+    sentences = re.split(b"|".join(re.escape(token).encode("utf-8") for token in special_tokens), chunk)
+
+    words_counts = defaultdict(int)
+    for sentence in sentences:
+        matches = re.finditer(pre_tokenization_pattern, sentence.decode('utf-8', errors='replace'))
+        for match in matches:
+            word_bytes = tuple(match.group().encode("utf-8"))
+            words_counts[word_bytes] += 1
+
+    return words_counts
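
The commit moves process_chunk to module level (so worker processes can pickle it) and computes chunk boundaries once on "<|endoftext|>", but the code that actually fans chunk_pairs out to workers falls between the hunks shown above. Below is a minimal sketch of that dispatch, assuming a multiprocessing.Pool with starmap and a simple dict merge; the wrapper name parallel_pretokenize is hypothetical, while find_chunk_boundaries and process_chunk are the helpers already referenced in the diff.

from collections import defaultdict
from multiprocessing import Pool

def parallel_pretokenize(input_path: str, special_tokens: list[str], num_processes: int = 4) -> dict:
    # Sketch only: align chunk boundaries to "<|endoftext|>" so no document straddles two workers.
    with open(input_path, "rb") as f:
        boundaries = find_chunk_boundaries(f, num_processes, "<|endoftext|>".encode("utf-8"))

    chunk_pairs = list(zip(boundaries[:-1], boundaries[1:]))

    # Each worker re-opens the file, seeks to its slice, and returns its own word counts.
    with Pool(processes=num_processes) as pool:
        results = pool.starmap(
            process_chunk,
            [(start, end, input_path, special_tokens) for start, end in chunk_pairs],
        )

    # Fold the per-chunk counts into one table keyed by the word's byte tuple.
    words_counts = defaultdict(int)
    for partial in results:
        for word_bytes, count in partial.items():
            words_counts[word_bytes] += count
    return words_counts

The aggregated words_counts is what the optimized merge loop above consumes; the exact aggregation in the commit is not visible in these hunks, so treat this wiring as an illustration rather than the committed implementation.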