|
| 1 | +# Copyright 2024 RecML authors <recommendations-ml@google.com>. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +import jax |
| 15 | +import jax.numpy as jnp |
| 16 | +import numpy as np |
| 17 | +import numpy.testing as npt |
| 19 | +from absl.testing import absltest |
| 20 | +from recml.examples.DLRM_HSTU.action_encoder import ActionEncoder |
| 21 | + |
| 22 | + |
class ActionEncoderJaxTest(absltest.TestCase):
  """Unit tests for the Flax `ActionEncoder` module."""

  def test_forward_and_backward(self) -> None:
    """Tests the ActionEncoder's forward pass logic and differentiability.

    Builds a small batch of action bitmasks (explicit actions plus actions
    implied by watchtime thresholds), runs the encoder, and checks that:
      * padded positions produce all-zero embeddings,
      * target positions receive the full target-action embedding table,
      * history positions receive per-action-type embeddings only for the
        action types enabled in that item's bitmask,
      * gradients flow into both embedding tables.
    """
    batch_size = 2
    max_seq_len = 6
    action_embedding_dim = 32
    # Bit weights for the explicitly logged action types (bitmask encoding).
    action_weights = [1, 2, 4, 8, 16]
    # (threshold, bit_weight): a watchtime strictly greater than `threshold`
    # enables the corresponding implicit action type.
    watchtime_to_action_thresholds_and_weights = [
        (30, 32),
        (60, 64),
        (100, 128),
    ]
    num_action_types = len(action_weights) + len(
        watchtime_to_action_thresholds_and_weights
    )
    output_dim = action_embedding_dim * num_action_types
    combined_action_weights = action_weights + [
        w for _, w in watchtime_to_action_thresholds_and_weights
    ]

    # Per-item lists of enabled action-type indices, flattened over history
    # items only: items 0-3 belong to sequence 0, items 4-5 to sequence 1.
    enabled_actions = [
        [0],  # Seq 1, Item 1
        [0, 1],  # Seq 1, Item 2
        [1, 3, 4],  # Seq 1, Item 3
        [1, 2, 3, 4],  # Seq 1, Item 4
        [1, 2],  # Seq 2, Item 1
        [2],  # Seq 2, Item 2
    ]
    watchtimes_flat = [40, 20, 110, 31, 26, 55]

    # Enable the implicit action types whose watchtime threshold is exceeded.
    for i, wt in enumerate(watchtimes_flat):
      for j, (threshold, _) in enumerate(
          watchtime_to_action_thresholds_and_weights
      ):
        if wt > threshold:
          enabled_actions[i].append(j + len(action_weights))

    # Encode each item's enabled action types as a single integer bitmask.
    actions_flat = [
        sum(combined_action_weights[t] for t in x) for x in enabled_actions
    ]

    padded_actions = np.zeros((batch_size, max_seq_len), dtype=np.int64)
    padded_watchtimes = np.zeros((batch_size, max_seq_len), dtype=np.int64)

    # Sequence 0 has 4 history items; sequence 1 has 2.
    padded_actions[0, :4] = actions_flat[0:4]
    padded_actions[1, :2] = actions_flat[4:6]
    padded_watchtimes[0, :4] = watchtimes_flat[0:4]
    padded_watchtimes[1, :2] = watchtimes_flat[4:6]

    # Target positions follow the history items within each valid prefix.
    is_target_mask = np.zeros((batch_size, max_seq_len), dtype=bool)
    is_target_mask[0, 4:6] = True
    is_target_mask[1, 2] = True

    # Valid (non-padding) positions: history items plus target positions.
    padding_mask = np.zeros((batch_size, max_seq_len), dtype=bool)
    padding_mask[0, :6] = True
    padding_mask[1, :3] = True

    seq_payloads = {
        "watchtimes": jnp.array(padded_watchtimes),
        "actions": jnp.array(padded_actions),
    }

    encoder = ActionEncoder(
        watchtime_feature_name="watchtimes",
        action_feature_name="actions",
        action_weights=action_weights,
        watchtime_to_action_thresholds_and_weights=(
            watchtime_to_action_thresholds_and_weights
        ),
        action_embedding_dim=action_embedding_dim,
    )

    key = jax.random.PRNGKey(0)
    variables = encoder.init(key, seq_payloads, is_target_mask)
    params = variables["params"]

    action_embeddings = encoder.apply(
        variables, seq_payloads, is_target_mask
    )

    self.assertEqual(
        action_embeddings.shape, (batch_size, max_seq_len, output_dim)
    )

    action_table = params["action_embedding_table"]
    target_table_flat = params["target_action_embedding_table"]
    # The target table is stored flat; view it as one row per action type to
    # compare against per-type slices of the output.
    target_table = target_table_flat.reshape(num_action_types, -1)

    # Walk the output position by position, consuming flat history items in
    # order; targets and padding do not advance the history index.
    history_item_idx = 0
    for b in range(batch_size):
      for s in range(max_seq_len):
        if not padding_mask[b, s]:
          # Padded positions must be exactly zeroed out.
          npt.assert_allclose(action_embeddings[b, s], 0, atol=1e-6)
          continue

        embedding = action_embeddings[b, s].reshape(num_action_types, -1)

        if is_target_mask[b, s]:
          # Target positions get the full target-action embedding table.
          npt.assert_allclose(embedding, target_table, atol=1e-6)
        else:
          # History positions get the action-type embedding for each enabled
          # type and zeros for every disabled type.
          current_enabled = enabled_actions[history_item_idx]
          for atype in range(num_action_types):
            if atype in current_enabled:
              npt.assert_allclose(
                  embedding[atype], action_table[atype], atol=1e-6
              )
            else:
              npt.assert_allclose(embedding[atype], 0, atol=1e-6)
          history_item_idx += 1

    def loss_fn(p):
      # Scalar reduction so jax.grad can differentiate the encoder output.
      return encoder.apply({"params": p}, seq_payloads, is_target_mask).sum()

    grads = jax.grad(loss_fn)(params)
    # jax.grad always returns a pytree, so check for the expected entries
    # rather than a vacuous not-None assertion.
    self.assertIn("action_embedding_table", grads)
    self.assertIn("target_action_embedding_table", grads)
    # Both tables must receive some nonzero gradient from the summed output.
    self.assertFalse(np.all(np.isclose(grads["action_embedding_table"], 0)))
    self.assertFalse(
        np.all(np.isclose(grads["target_action_embedding_table"], 0))
    )
| 144 | + |
| 145 | + |
| 146 | +if __name__ == "__main__": |
| 147 | + absltest.main() |