Merge branch 'foundation-model-stack:main' into tinto_play_main

tinhto-000 · web-flow · commit 935e77fbb1be · 2025-12-13T12:47:27.000-05:00
diff --git a/aiu_fms_testing_utils/scripts/drive_paged_programs.py b/aiu_fms_testing_utils/scripts/drive_paged_programs.py
@@ -264,9 +264,7 @@ def __custom_line_sampler(*args, **kwargs):
 max_tkv = int(os.environ["VLLM_DT_MAX_CONTEXT_LEN"])
 
 
-def __prepare_inputs(
-    batch_size, seq_length, tokenizer, enforce_sizes=[], seed=0, pad_multiple=64
-):
+def __prepare_inputs(batch_size, seq_length, tokenizer, enforce_sizes=[], seed=0):
     start = time.time()
     prompts_and_sizes, sample_key = sampler(
         DATASET_PATH,
@@ -278,7 +276,6 @@ def __prepare_inputs(
         enforce_sizes=enforce_sizes,
         truncation=allow_truncation,
         return_key=True,
-        pad_multiple=pad_multiple,
     )
     end = time.time()
     if local_rank == 0:
@@ -291,6 +288,10 @@ def __prepare_inputs(
             encoded = encoded[:seq_length]
         prompt_list.append(encoded)
 
+    if not prompt_list:
+        raise ValueError(
+            f"No valid prompt sample exists in dataset for input shape (Batch Size={batch_size}, Seq Length={seq_length})"
+        )
     if len(prompt_list) < batch_size:
         dprint(
             f"You requested {batch_size} prompts but we were only able to get {len(prompt_list)} valid prompts. We will be repeating the first prompt."
@@ -396,13 +397,7 @@ def __load_validation_info(
 # warmup with any input so compiler produces criteria json
 # TODO: Swap this with __prepare_inputs once fix for shape_id is available
 # input_ids, extra_kwargs, sample_key = __prepare_inputs(2, max_tkv, tokenizer)
-pad_multiple = 64
-if args.prefill_chunk_size > 0:
-    assert args.prefill_chunk_size % 64 == 0, (
-        "Chunk size must be a multiple of the page size"
-    )
-    pad_multiple = args.prefill_chunk_size
-prompt_list = [torch.arange(0, pad_multiple, dtype=torch.int64)]
+prompt_list = [torch.arange(0, 64, dtype=torch.int64)]
 # matching vllm warmup to pad to 2 on fp8, and no pad for fp16
 if is_fp8:
     prompt_list = prompt_list * 2
@@ -526,7 +521,7 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]:
 # FIXME: filter condition for this on prompt and batch
 program_map = get_programs_prompts(
     program_criteria_list,
-    multiple=pad_multiple,
+    multiple=64,
     max_batch_size=max_batch_size,
     max_tkv=max_tkv,
     program_cycles=max_new_tokens,
@@ -547,7 +542,6 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]:
                     valid_prompt_shape[1],
                     tokenizer,
                     enforce_sizes=enforce_sizes,
-                    pad_multiple=pad_multiple,
                 )
                 valid_prompts = [
                     (
@@ -601,29 +595,14 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]:
                     # if there does not exist enough sequence sizes between this range, we will cycle back to the beginning
                     # in the event we don't have enough sequences that satisfy the enforce_sizes, we will repeat sequences and warn the user
                     enforce_sizes = [valid_prompt_shape[1]]
-                    if (
-                        args.enforce_homogeneous_prompt_programs
-                        or args.prefill_chunk_size > 0
-                    ):
-                        # if enforcing homogeneous prompt programs, this will get the number of bits for the sequence length and shift to get the power of 2 that is less than or equal to the sequence length
-                        tkv_cutoff = (
-                            1 << (valid_prompt_shape[1].bit_length() - 1)
-                            if args.enforce_homogeneous_prompt_programs
-                            else pad_multiple
-                        )
-
+                    if args.enforce_homogeneous_prompt_programs:
+                        # this will get the number of bits for the sequence length and shift to get the power of 2 that is less than or equal to the sequence length
+                        tkv_cutoff = 1 << (valid_prompt_shape[1].bit_length() - 1)
                         possible_seq_lengths = [
-                            _
-                            for _ in range(
-                                tkv_cutoff, valid_prompt_shape[1], pad_multiple
-                            )
+                            _ for _ in range(tkv_cutoff, valid_prompt_shape[1], 64)
                         ]
                         # favor sequences that are close to the valid prompt length
                         possible_seq_lengths.reverse()
-                        # add the valid prompt size to the end since it will already exist in the above enforce_sizes
-                        possible_seq_lengths = possible_seq_lengths + [
-                            valid_prompt_shape[1]
-                        ]
                         enforce_sizes = enforce_sizes + list(
                             itertools.islice(
                                 itertools.cycle(possible_seq_lengths),
@@ -636,7 +615,6 @@ def parse_program_limit(limit_str: str) -> tuple[int, str]:
                             valid_prompt_shape[1],
                             tokenizer,
                             enforce_sizes=enforce_sizes,
-                            pad_multiple=64,  # this should be the smallest granularity to ensure we get the largest enforce_size (if we choose chunked prefill, we want to make sure we pad to the full enforced size)
                         )
                         valid_prompts.append(
                             (
diff --git a/aiu_fms_testing_utils/utils/paged.py b/aiu_fms_testing_utils/utils/paged.py
@@ -5,6 +5,7 @@
 from typing import Any, Callable, List, MutableMapping, Optional, Tuple, Union
 import torch
 import fms.utils.spyre.paged  # noqa
+from aiu_fms_testing_utils.utils import get_pad_size
 
 
 def adjust_inputs_to_batch(input_ids: torch.Tensor, **extra_kwargs):
@@ -226,6 +227,12 @@ def generate(
     # left_padded_prompt_mask - empty_slots + context_lengths
     current_tkv_mask = torch.fill(context_lengths, input_ids.shape[1])
 
+    # if using chunked prefill, reserve a pad block
+    # reserving a pad block is required as writes to pad are done in parallel and could corrupt the real blocks
+    if prefill_chunk_size > 0:
+        pad_block_id = block_numbers.pop(0)
+        pad_slots = [(pad_block_id * BLOCK_SIZE) + pos_i for pos_i in range(BLOCK_SIZE)]
+
     slot_mapping = []
     block_table = []
     # each sequence has the possibility of a different tkv, so loop over that
@@ -244,6 +251,7 @@ def generate(
             slot_mapping_i.append(slot)
         slot_mapping.append(slot_mapping_i)
         block_table.append(block_table_i)
+
     kwargs["current_tkv_mask"] = None
     kwargs["left_padded_prompt_mask"] = None
     kwargs["use_cache"] = use_cache
@@ -300,64 +308,110 @@ def generate(
                 last_n_tokens = kwargs.get("last_n_tokens", 0)
 
                 if prefill_chunk_size > 0:
-                    left_padded_prompt_mask_seq_chunk = None
+                    required_extra_pads = (
+                        get_pad_size(current_tkv.item(), prefill_chunk_size)
+                        - current_tkv.item()
+                    )
+                    left_padded_prompt_mask_seq_chunk = (
+                        (kwargs["position_ids"][seq_i][-current_tkv.item() :] == 0).sum(
+                            dim=0
+                        )
+                        - 1
+                        + required_extra_pads
+                    )
+                    left_padded_prompt_mask_seq_chunk = (
+                        left_padded_prompt_mask_seq_chunk.unsqueeze(0)
+                    )
+                    block_seq_left_padding = required_extra_pads // BLOCK_SIZE
+
                     # Chunked prefill
                     for chunk_j in range(math.ceil(current_tkv / prefill_chunk_size)):
-                        chunk_start = -current_tkv + chunk_j * prefill_chunk_size
-                        chunk_end = -current_tkv + min(
-                            (chunk_j + 1) * prefill_chunk_size, current_tkv
-                        )
+                        # chunk_start and chunk_end are the index mappings from the original sequence
+                        if chunk_j == 0:
+                            chunk_start = 0
+                            chunk_end = prefill_chunk_size - required_extra_pads
+                        else:
+                            required_extra_pads = 0
+                            chunk_start = chunk_end
+                            chunk_end += prefill_chunk_size
+
+                        input_ids_seq_chunk = input_ids[seq_i][-current_tkv:][
+                            chunk_start:chunk_end
+                        ]
+                        slot_mapping_seq_chunk = slot_mapping[seq_i][-current_tkv:][
+                            chunk_start:chunk_end
+                        ]
+                        position_ids_seq_chunk = kwargs["position_ids"][seq_i][
+                            -current_tkv:
+                        ][chunk_start:chunk_end]
+
+                        # add the extra required padding to chunk
+                        if required_extra_pads > 0:
+                            input_ids_seq_chunk = torch.cat(
+                                (
+                                    torch.zeros(
+                                        required_extra_pads,
+                                        dtype=torch.int64,
+                                        device=input_ids_seq_chunk.device,
+                                    ),
+                                    input_ids_seq_chunk,
+                                )
+                            )
+                            slot_mapping_seq_chunk = (
+                                pad_slots * (required_extra_pads // BLOCK_SIZE)
+                                + slot_mapping_seq_chunk
+                            )
+                            position_ids_seq_chunk = torch.cat(
+                                (
+                                    torch.zeros(
+                                        required_extra_pads,
+                                        dtype=torch.int64,
+                                        device=position_ids_seq_chunk.device,
+                                    ),
+                                    position_ids_seq_chunk,
+                                )
+                            )
+
+                        input_ids_seq_chunk = input_ids_seq_chunk.unsqueeze(0).clone()
 
-                        ids_length = input_ids[seq_i].shape[0]
-                        input_ids_seq_chunk = (
-                            input_ids[seq_i][
-                                chunk_start + ids_length : chunk_end + ids_length
-                            ]
-                            .unsqueeze(0)
-                            .clone()
-                        )
-                        assert input_ids_seq_chunk.size(1) == prefill_chunk_size, (
-                            f"prefill chunk size was not equal to the chunk size. Found {input_ids_seq_chunk.size(0)}"
-                        )
-                        slots_length = len(slot_mapping[seq_i])
                         slot_mapping_seq_chunk = (
                             torch.tensor(
-                                slot_mapping[seq_i][
-                                    chunk_start + slots_length : chunk_end
-                                    + slots_length
-                                ],
+                                slot_mapping_seq_chunk,
                                 dtype=torch.int64,
                             )
                             .unsqueeze(0)
                             .clone()
                         )
-                        pids_length = kwargs["position_ids"][seq_i].shape[0]
-                        position_ids_seq_chunk = (
-                            kwargs["position_ids"][seq_i][
-                                chunk_start + pids_length : chunk_end + pids_length
-                            ]
-                            .unsqueeze(0)
-                            .clone()
+
+                        position_ids_seq_chunk = position_ids_seq_chunk.unsqueeze(
+                            0
+                        ).clone()
+
+                        assert input_ids_seq_chunk.size(1) == prefill_chunk_size, (
+                            f"prefill chunk size was not equal to the chunk size for input_ids. Found {input_ids_seq_chunk.size(0)}"
                         )
 
-                        # This view will result in a discontiguous tensor (creates a new graph during compile)
-                        # For this reason, we must explicitly make contiguous
-                        if left_padded_prompt_mask_seq_chunk is None:
-                            left_padded_prompt_mask_seq_chunk = (
-                                position_ids_seq_chunk == 0
-                            ).sum(dim=1) - 1
-                        current_tkv_mask_seq_chunk = torch.min(
-                            torch.tensor(
-                                (chunk_j + 1) * prefill_chunk_size, dtype=torch.int64
-                            ),
-                            current_tkv,
+                        assert slot_mapping_seq_chunk.size(1) == prefill_chunk_size, (
+                            f"prefill chunk size was not equal to the chunk size for slot_mapping. Found {slot_mapping_seq_chunk.size(0)}"
+                        )
+
+                        assert position_ids_seq_chunk.size(1) == prefill_chunk_size, (
+                            f"prefill chunk size was not equal to the chunk size for position_ids. Found {position_ids_seq_chunk.size(0)}"
+                        )
+
+                        current_tkv_mask_seq_chunk = torch.tensor(
+                            (chunk_j + 1) * prefill_chunk_size, dtype=torch.int64
                         ).unsqueeze(0)
 
-                        table_length = len(block_table[seq_i])
-                        block_start = -current_tkv // BLOCK_SIZE + table_length
-                        block_end = chunk_end // BLOCK_SIZE + table_length
+                        block_end = chunk_end // BLOCK_SIZE
+                        # length of padding or index until padding has occurred in block table
+                        block_pad_len = (input_ids.shape[1] - current_tkv) // BLOCK_SIZE
                         block_table_seq_chunk = torch.tensor(
-                            block_table[seq_i][block_start:block_end], dtype=torch.int64
+                            [pad_block_id] * (block_seq_left_padding)
+                            + block_table[seq_i][
+                                block_pad_len : block_pad_len + block_end
+                            ],
+                            dtype=torch.int64,
                         ).unsqueeze(0)
 
                         chunked_kwargs = {
diff --git a/tests/models/test_scripts.py b/tests/models/test_scripts.py
@@ -175,17 +175,25 @@ def execute_dpp(
     test_type,
     skip_validation,
     enforce_homogeneous_prompt_programs,
+    prefill_chunk_size,
     shared_tmp_path,
     isolated_env,
 ):
     isolated_env["VLLM_DT_MAX_BATCH_TKV_LIMIT"] = "1024"
     isolated_env["VLLM_DT_MAX_CONTEXT_LEN"] = "512"
     isolated_env["VLLM_DT_MAX_BATCH_SIZE"] = "2"
+    if prefill_chunk_size > 0:
+        isolated_env["VLLM_DT_CHUNK_LEN"] = f"{prefill_chunk_size}"
     Path(os.path.join(shared_tmp_path, "sendnn_cache")).mkdir(exist_ok=True)
-    os.environ.setdefault(
-        "TORCH_SENDNN_CACHE_DIR", os.path.join(shared_tmp_path, "sendnn_cache")
-    )
-    isolated_env["TORCH_SENDNN_CACHE_ENABLE"] = "1"
+
+    # only enable the sendnn cache when chunked prefill is not used (prefill_chunk_size == 0)
+    if prefill_chunk_size == 0:
+        os.environ.setdefault(
+            "TORCH_SENDNN_CACHE_DIR", os.path.join(shared_tmp_path, "sendnn_cache")
+        )
+        isolated_env["TORCH_SENDNN_CACHE_ENABLE"] = "1"
+    else:
+        isolated_env["TORCH_SENDNN_CACHE_ENABLE"] = "0"
 
     command_list = [
         "python3",
@@ -239,6 +247,9 @@ def execute_dpp(
     if enforce_homogeneous_prompt_programs:
         command_list += ["--enforce_homogeneous_prompt_programs"]
 
+    if prefill_chunk_size > 0:
+        command_list += [f"--prefill_chunk_size={prefill_chunk_size}"]
+
     # add program criteria path
     command_list += [
         f"--program_criteria_json_path={os.environ['DT_PROG_CRITERIA_FILEPATH']}"
@@ -249,21 +260,24 @@ def execute_dpp(
 
 dpp_possibilities = []
 dpp_possibilities.append(
-    ("paged", None, 8, "sharegpt", "metrics", False, False)
+    ("paged", None, 8, "sharegpt", "metrics", False, False, 0)
 )  # metrics and run all programs
 dpp_possibilities.append(
-    ("paged", "*:0,==256", 65, "sharegpt", "tokens", False, False)
+    ("paged", "*:0,==256", 65, "sharegpt", "tokens", False, False, 0)
 )  # tokens and run all programs that satisfy 256 sequence length
 dpp_possibilities.append(
-    ("paged", "*:>=2,0", 65, "sharegpt", None, True, True)
+    ("paged", "*:>=2,0", 65, "sharegpt", None, True, True, 0)
 )  # metrics and run all programs that have >=2 batch size
 dpp_possibilities.append(
-    ("paged", None, 8, "custom", "tokens", False, False)
+    ("paged", None, 8, "custom", "tokens", False, False, 0)
 )  # tokens running with specific custom dataset
+dpp_possibilities.append(
+    ("paged", None, 8, "sharegpt", "tokens", False, False, 128)
+)  # tokens and run all programs with chunked prefill
 
 
 @pytest.mark.parametrize(
-    "attn_type,programs,max_new_tokens,dataset_type,test_type,skip_validation,enforce_homogeneous_prompt_programs",
+    "attn_type,programs,max_new_tokens,dataset_type,test_type,skip_validation,enforce_homogeneous_prompt_programs,prefill_chunk_size",
     dpp_possibilities,
 )
 def test_dpp_script(
@@ -274,6 +288,7 @@ def test_dpp_script(
     test_type,
     skip_validation,
     enforce_homogeneous_prompt_programs,
+    prefill_chunk_size,
     shared_tmp_path,
     isolated_env,
 ):
@@ -290,6 +305,7 @@ def test_dpp_script(
         test_type,
         skip_validation,
         enforce_homogeneous_prompt_programs,
+        prefill_chunk_size,
         shared_tmp_path,
         isolated_env,
     )