
Commit 698f33f

Chunked Prefill (#188)
1 parent f602565 commit 698f33f

5 files changed

Lines changed: 340 additions & 27 deletions


jetstream/core/orchestrator.py

Lines changed: 88 additions & 15 deletions
@@ -91,6 +91,8 @@
 
 import grpc
 import jax
+import jax.numpy as jnp
+
 from jetstream.core.proto import jetstream_pb2
 from jetstream.core.proto import jetstream_pb2_grpc
 from jetstream.core.utils import async_multifuture
@@ -519,26 +521,59 @@ def _process_prefill_content(
       tokenizer: tokenizer_api.Tokenizer,
       is_bos: bool,
       max_prefill_length: int,
-  ) -> Tuple[jax.Array | np.ndarray, int]:
+      chunked_prefill: bool = False,
+      chunk_size: Optional[int] = None,
+  ) -> Tuple[jax.Array | np.ndarray, jax.Array, jax.Array | np.ndarray]:
     content = request.prefill_content
     if isinstance(content, str):
       # If it's text input, tokenize and pad the input.
-      return tokenizer.encode(
+      tokens, true_length = tokenizer.encode(
          content,
          is_bos=is_bos,
          max_prefill_length=max_prefill_length,
          jax_padding=self._jax_padding,
      )
+      positions = jnp.expand_dims(
+          jnp.arange(0, len(tokens), dtype=jnp.int32), 0
+      )
+
+      if chunked_prefill:
+        return token_utils.chunk_and_pad_tokens(
+            tokens[:true_length],
+            tokenizer.bos_id,
+            tokenizer.pad_id,
+            is_bos=is_bos,
+            max_prefill_length=max_prefill_length,
+            chunk_size=chunk_size,
+            jax_padding=self._jax_padding,
+        )
+      return tokens, true_length, positions
+
     else:
+      if chunked_prefill:
+        return token_utils.chunk_and_pad_tokens(
+            content,
+            tokenizer.bos_id,
+            tokenizer.pad_id,
+            is_bos=is_bos,
+            max_prefill_length=max_prefill_length,
+            chunk_size=chunk_size,
+            jax_padding=self._jax_padding,
+        )
+
       # If it's token input, pad the input.
-      return token_utils.pad_tokens(
+      tokens, true_length = token_utils.pad_tokens(
          content,
          tokenizer.bos_id,
          tokenizer.pad_id,
          is_bos=is_bos,
          max_prefill_length=max_prefill_length,
          jax_padding=self._jax_padding,
      )
+      positions = jnp.expand_dims(
+          jnp.arange(0, len(tokens), dtype=jnp.int32), 0
+      )
+      return tokens, true_length, positions
 
   def _prefill_thread(self, idx: int):
     """Thread which runs in the background performing prefills."""
@@ -566,8 +601,12 @@ def _prefill_thread(self, idx: int):
          f" is_bos: {is_bos}",
      )
      # Tokenize and padding the text or token input.
-      padded_tokens, true_length = self._process_prefill_content(
-          request, tokenizer, is_bos, prefill_engine.max_prefill_length
+      padded_tokens, true_length, _ = self._process_prefill_content(
+          request,
+          tokenizer,
+          is_bos,
+          prefill_engine.max_prefill_length,
+          False,
      )
 
      # Compute new kv cache for the prefill_content.
@@ -580,17 +619,51 @@ def _prefill_thread(self, idx: int):
        )
        request.complete = np.zeros((request.num_samples,), np.bool_)
      else:
-        prefill_result, first_token = prefill_engine.prefill(
-            params=prefill_params,
-            padded_tokens=padded_tokens,
-            true_length=true_length,
-            request_id=request.request_id,
-        )
-        request.complete = np.zeros(
-            (prefill_engine.samples_per_slot,), np.bool_
-        )
-
+        # If chunked prefill is enabled, prefill the prompt one chunk at a time.
+        if prefill_engine.use_chunked_prefill:
+          padded_chunked_tokens, true_lengths_of_chunks, positions_chunks = (
+              self._process_prefill_content(
+                  request,
+                  tokenizer,
+                  is_bos,
+                  prefill_engine.max_prefill_length,
+                  prefill_engine.use_chunked_prefill,
+                  prefill_engine.chunk_size,
+              )
+          )
+          prefill_result = None
+          for chunk_num, _ in enumerate(padded_chunked_tokens):
+            cache_so_far = (
+                {} if prefill_result is None else prefill_result["cache"]  # pylint: disable=unsubscriptable-object
+            )
+            prefill_result, first_token = prefill_engine.prefill(
+                params=prefill_params | {"cache": cache_so_far},
+                padded_tokens=padded_chunked_tokens[chunk_num],
+                true_length=true_lengths_of_chunks[chunk_num],
+                positions=positions_chunks[chunk_num],
+                previous_chunk=prefill_result,
+                complete_prompt_true_length=true_length,
+                complete_padded_prompt=padded_tokens,
+            )
+            # Positions processed so far: all previous chunks plus this chunk's true length.
+            t_l_array = jnp.expand_dims(
+                jnp.arange(
+                    0,
+                    chunk_num * prefill_engine.chunk_size
+                    + true_lengths_of_chunks[chunk_num],
+                ),
+                1,
+            )
+            prefill_result["true_length_array"] = t_l_array
+        else:
+          # Compute new kv cache for the prefill_content.
+          prefill_result, first_token = prefill_engine.prefill(
+              params=prefill_params,
+              padded_tokens=padded_tokens,
+              true_length=true_length,
+          )
      request.prefill_result = prefill_result
+      request.complete = np.zeros((prefill_engine.samples_per_slot,), np.bool_)
 
      # put first token to detokenize queue
      my_detokenize_backlog = self._detokenize_backlogs[idx]
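
The loop above is the core of the feature: each chunk is prefilled with the KV cache accumulated from the previous chunks (params=prefill_params | {"cache": cache_so_far}), so after the last chunk the engine holds a cache equivalent to prefilling the full prompt in one pass. Below is a standalone toy sketch of that cache-threading pattern; toy_prefill and its running-sum "cache" are hypothetical stand-ins for the real engine:

import jax.numpy as jnp

def toy_prefill(cache_so_far, chunk_tokens):
  """Dummy stand-in for engine.prefill: the "cache" is just a running sum."""
  new_cache = cache_so_far + jnp.sum(chunk_tokens)
  first_token = new_cache.astype(jnp.int32)
  return {"cache": new_cache}, first_token

tokens = jnp.arange(10, dtype=jnp.int32)
chunk_size = 4
chunks = [tokens[i : i + chunk_size] for i in range(0, len(tokens), chunk_size)]

result = None
for chunk in chunks:
  # Empty cache for the first chunk, then reuse the previous chunk's cache.
  cache_so_far = jnp.int32(0) if result is None else result["cache"]
  result, _ = toy_prefill(cache_so_far, chunk)

# Equivalent to prefilling the whole prompt in a single call.
assert result["cache"] == jnp.sum(tokens)
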

jetstream/engine/mock_engine.py

Lines changed: 60 additions & 12 deletions
@@ -86,6 +86,7 @@ def __init__(
      cache_length: int,
      weight: float,
      vocab_size: int = 1024,
+      use_chunked_prefill: bool = False,
  ):
    self.prefill_cache_batch = batch_size
    self.generate_cache_batch = batch_size
@@ -96,17 +97,24 @@ def __init__(
        mesh_utils.create_device_mesh((1, 1, 1), jax.devices()), ("x", "y", "z")
    )
    self._prng_key = jax.random.PRNGKey(42)
+    self._use_chunked_prefill = use_chunked_prefill
 
  def load_params(self) -> Params:
    """Loads model weights."""
    # An integer, used to multiply inputs.
    return jnp.array([self.weight], dtype=jnp.float32)
 
+  def load_params_dict(self) -> Params:
+    """Loads model weights."""
+    # An integer, used to multiply inputs.
+    return {"params": jnp.array([self.weight], dtype=jnp.float32)}
+
  @functools.partial(
      jax.jit,
      static_argnums=(0,),
      static_argnames=("request_id",),
  )
+  # pylint: disable=unused-argument
  def prefill(
      self,
      *,
@@ -115,6 +123,10 @@ def prefill(
      padded_tokens: jax.Array,
      true_length: int,
      request_id: Optional[uuid.UUID] = None,
+      previous_chunk=None,
+      complete_padded_prompt=None,
+      complete_prompt_true_length=None,
+      positions=None,
  ) -> Tuple[Prefix, engine_api.ResultTokens]:
    """Computes a kv-cache for a new generate request.
 
@@ -133,20 +145,33 @@ def prefill(
    assert padded_tokens.ndim == 1
 
    # Generate dummy prefill cache content
-    prefill_cache = padded_tokens[None, :] * params
+    if not self._use_chunked_prefill:
+      prefill_cache = padded_tokens[None, :] * params
+    else:
+      prefill_cache = padded_tokens[None, :]
 
    # Create a dummy first generated token.
    first_generated_token = (prefill_cache.sum(axis=-1).astype(jnp.int32))[
        :, jnp.newaxis
    ]
 
-    prefix = Prefix(
-        logits=jax.random.normal(self._prng_key, (1, self.vocab_size)),
-        cache=prefill_cache,
-        next_pos=jnp.full((1, 1), true_length, dtype=jnp.int32),
-        num_generated_tokens=jnp.zeros((1, 1), dtype=jnp.int32),
-        first_token=first_generated_token,
-    )
+    if not self._use_chunked_prefill:
+      prefix = Prefix(
+          logits=jax.random.normal(self._prng_key, (1, self.vocab_size)),
+          cache=prefill_cache,
+          next_pos=jnp.full((1, 1), true_length, dtype=jnp.int32),
+          num_generated_tokens=jnp.zeros((1, 1), dtype=jnp.int32),
+          first_token=first_generated_token,
+      )
+    else:
+      prefix = {
+          "logits": jax.random.normal(self._prng_key, (1, self.vocab_size)),
+          "cache": prefill_cache,
+          "next_pos": jnp.full((1, 1), true_length, dtype=jnp.int32),
+          "generated_tokens": jnp.zeros((1, 1), dtype=jnp.int32),
+          "tokens": first_generated_token,
+          "first_token": first_generated_token,
+      }
 
    speculations = first_generated_token.shape[1]
    result_tokens = engine_api.ResultTokens(
@@ -319,15 +344,19 @@ def generate(
  )
  def insert(
      self,
-      prefix: Prefix,
+      prefix: Any,
      decode_state: DecodeState,
      slot: int,
      request_id: Optional[uuid.UUID] = None,
  ) -> DecodeState:
    """Adds `prefix` into `decode_state` at `slot`."""
-    prefill_cache = prefix.cache
+    if not self._use_chunked_prefill:
+      prefill_cache = prefix.cache
+    else:
+      prefill_cache = prefix["cache"]
+
    prefill_cache = jax.lax.dynamic_update_slice_in_dim(
-        decode_state.prefill_cache, prefill_cache, slot, axis=0
+        decode_state.prefill_cache, prefill_cache * 1.0, slot, axis=0
    )
    generate_cache = jax.lax.dynamic_update_slice_in_dim(
        decode_state.generate_cache,
@@ -342,9 +371,13 @@ def insert(
        slot * samples_per_slot,
        axis=0,
    )
+    if not self._use_chunked_prefill:
+      first_token = prefix.first_token
+    else:
+      first_token = prefix["first_token"]
    generate_tokens = jax.lax.dynamic_update_slice_in_dim(
        decode_state.generate_tokens,
-        prefix.first_token,
+        first_token,
        slot * samples_per_slot,
        axis=0,
    )
@@ -455,3 +488,18 @@ def mesh(self) -> jax.sharding.Mesh:
  def colocated_cpus(self) -> None:
    """CPU devices colocated with the engine's accelerators."""
    raise NotImplementedError
+
+  @property
+  def use_chunked_prefill(self) -> bool:
+    """Whether chunked prefill is enabled."""
+    return self._use_chunked_prefill
+
+  @property
+  def chunk_size(self) -> int:
+    """Chunk size used for chunked prefill."""
+    return 2
+
+  @property
+  def prefill_chunk_size(self) -> int:
+    """Prefill chunk size."""
+    return 64
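
With use_chunked_prefill enabled, the mock engine's prefill() returns the prefix as a plain dict rather than the Prefix NamedTuple, so insert() reads the cache and first token through the branches added above. Here is a standalone sketch of that access pattern; the minimal Prefix tuple and read_prefix helper are illustrative stand-ins, not the definitions in mock_engine.py:

from typing import Any, NamedTuple
import jax.numpy as jnp

class Prefix(NamedTuple):  # minimal stand-in for the mock engine's Prefix
  cache: jnp.ndarray
  first_token: jnp.ndarray

def read_prefix(prefix: Any, use_chunked_prefill: bool):
  """Mirrors the branching in insert(): attribute access for the NamedTuple
  path, key access for the chunked (dict) path."""
  if not use_chunked_prefill:
    return prefix.cache, prefix.first_token
  return prefix["cache"], prefix["first_token"]

tuple_prefix = Prefix(cache=jnp.ones((1, 4)), first_token=jnp.array([[3]]))
dict_prefix = {"cache": jnp.ones((1, 4)), "first_token": jnp.array([[3]])}

assert read_prefix(tuple_prefix, False)[0].shape == (1, 4)
assert read_prefix(dict_prefix, True)[0].shape == (1, 4)
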

jetstream/engine/token_utils.py

Lines changed: 86 additions & 0 deletions
@@ -21,6 +21,7 @@
 import jax
 import jax.numpy as jnp
 import numpy as np
+import math
 from seqio.vocabularies import SentencePieceVocabulary
 from seqio.vocabularies import Vocabulary
 
@@ -98,6 +99,91 @@ def tokenize_and_pad(
  return padded_tokens, true_length
 
 
+def chunk_and_pad_tokens(
+    tokens,
+    bos_id: int,
+    pad_id: int,
+    is_bos: bool = True,
+    prefill_lengths: Optional[List[int]] = None,
+    max_prefill_length: Optional[int] = None,
+    chunk_size: Optional[int] = None,
+    jax_padding: bool = True,
+) -> Tuple[
+    List[Union[jax.Array, np.ndarray]],
+    List[Union[jax.Array, np.ndarray]],
+    List[Union[jax.Array, np.ndarray]],
+]:
+  """Chunks and pads tokens for chunked prefill.
+  If the total token count is 520 and the chunk size is 256,
+  the function returns 3 chunks and the return tuple is as follows:
+  [[t0,..t255],[t256,..t511],[t512,..t519]],
+  [256, 256, 8],
+  [[0,..255],[256,..511],[512,..519,..]]
+
+  Args:
+    tokens: Tokens.
+    bos_id: Bos ID.
+    pad_id: Pad ID.
+    is_bos: Add a beginning of sequence token if this is true.
+    prefill_lengths: Buckets to pad the sequence to for static compilation.
+    max_prefill_length: Maximum bucket to use.
+    chunk_size: Maximum size of each chunk.
+    jax_padding: Convert to JAX padded tokens if True.
+
+  Returns:
+    chunk_padded_tokens: List of chunked and padded tokens.
+    padded_chunk_true_lengths: List of integers - true length of each chunk.
+    positions: List of the positions of each token in the chunk.
+  """
+
+  num_tokens = len(tokens)
+  num_chunks = int(math.ceil(num_tokens / chunk_size))
+  # every entry in chunk_padded_tokens is a padded chunk
+  chunk_padded_tokens = []
+
+  # true lengths for each chunk
+  padded_chunk_true_lengths = []
+
+  # positions of tokens in each chunk
+  positions = []
+  # to be able to slice the tokens
+  tokens = jnp.array(tokens)
+  for chunk_num in range(num_chunks):
+    start = int(chunk_num * chunk_size)
+    end = jnp.minimum((chunk_num + 1) * chunk_size, num_tokens)
+    chunk_tokens = jax.lax.slice(tokens, (start,), (end,))
+    if chunk_num == 0:
+      padded_chunk, padded_chunk_true_length = pad_tokens(
+          chunk_tokens,
+          bos_id,
+          pad_id,
+          is_bos,
+          prefill_lengths,
+          max_prefill_length,
+          jax_padding,
+      )
+    else:
+      # is_bos should be false in subsequent chunks.
+      padded_chunk, padded_chunk_true_length = pad_tokens(
+          chunk_tokens,
+          bos_id,
+          pad_id,
+          False,
+          prefill_lengths,
+          max_prefill_length,
+          jax_padding,
+      )
+
+    positions_chunk = jnp.expand_dims(
+        jnp.arange(start, start + len(padded_chunk), dtype=jnp.int32), 0
+    )
+    chunk_padded_tokens.append(padded_chunk)
+    padded_chunk_true_lengths.append(padded_chunk_true_length)
+    positions.append(positions_chunk)
+
+  return chunk_padded_tokens, padded_chunk_true_lengths, positions
+
+
 def pad_tokens(
    tokens: np.ndarray,
    bos_id: int,
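
A hedged usage sketch of the new helper, reproducing the 520-token example from its docstring; the bos_id/pad_id values are placeholders, and is_bos is set to False so no BOS token is prepended to the first chunk:

import jax.numpy as jnp
from jetstream.engine import token_utils

tokens = jnp.arange(520, dtype=jnp.int32)  # stand-in prompt of 520 tokens
chunks, true_lengths, positions = token_utils.chunk_and_pad_tokens(
    tokens,
    bos_id=1,   # placeholder ids
    pad_id=0,
    is_bos=False,
    max_prefill_length=256,
    chunk_size=256,
)
assert len(chunks) == 3         # 256 + 256 + 8 real tokens
print(true_lengths)             # expected: [256, 256, 8]
print(int(positions[1][0, 0]))  # expected: 256, positions continue across chunks
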
