Skip to content

Commit 20430ab

Browse files
committed
Merge remote-tracking branch 'origin/main' into fixbiassharding
2 parents 7a6ab88 + 384d211 commit 20430ab

38 files changed

Lines changed: 2461 additions & 403 deletions

Whitespace-only changes.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ __pycache__/
66
*$py.class
77
# C extensions
88
*.so
9+
Gemini.md
910

1011
# tests and logs
1112
tests/fixtures/cached_*_text.txt

dependencies/requirements/base_requirements/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
--extra-index-url https://download.pytorch.org/whl/cpu
22
absl-py
3+
accelerate
34
aqtp
45
chex
56
datasets
@@ -14,6 +15,7 @@ imageio-ffmpeg
1415
imageio
1516
jax
1617
jaxlib
18+
jaxopt
1719
Jinja2
1820
opencv-python-headless
1921
optax

dependencies/requirements/generated_requirements/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# If you need to modify dependencies, please do so in the host requirements file and run seed-env again.
33

44
absl-py>=2.3.1
5+
accelerate>=1.13.0
56
aiofiles>=25.1.0
67
aiohappyeyeballs>=2.6.1
78
aiohttp>=3.13.3
@@ -80,6 +81,7 @@ isort>=8.0.1
8081
jaraco-functools>=4.4.0
8182
jax>=0.9.0
8283
jaxlib>=0.9.0
84+
jaxopt>=0.8.5
8385
jaxtyping>=0.3.9
8486
jinja2>=3.1.6
8587
keras>=3.13.1

src/maxdiffusion/checkpointing/wan_checkpointer_2_1.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@
1515
"""
1616

1717
import json
18-
import jax
19-
import numpy as np
2018
from typing import Optional, Tuple
21-
from ..pipelines.wan.wan_pipeline_2_1 import WanPipeline2_1
22-
from .. import max_logging
23-
import orbax.checkpoint as ocp
2419
from etils import epath
20+
import jax
21+
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
2522
from maxdiffusion.checkpointing.wan_checkpointer import WanCheckpointer
23+
import numpy as np
24+
import orbax.checkpoint as ocp
25+
from .. import max_logging
26+
from ..pipelines.wan.wan_pipeline_2_1 import WanPipeline2_1
2627

2728

2829
class WanCheckpointer2_1(WanCheckpointer):
@@ -35,13 +36,29 @@ def load_wan_configs_from_orbax(self, step: Optional[int]) -> Tuple[Optional[dic
3536
max_logging.log("No WAN checkpoint found.")
3637
return None, None
3738
max_logging.log(f"Loading WAN checkpoint from step {step}")
39+
40+
cpu_devices = np.array(jax.devices(backend="cpu"))
41+
mesh = Mesh(cpu_devices, axis_names=("data",))
42+
replicated_sharding = NamedSharding(mesh, P())
43+
3844
metadatas = self.checkpoint_manager.item_metadata(step)
39-
transformer_metadata = metadatas.wan_state
40-
abstract_tree_structure_params = jax.tree_util.tree_map(ocp.utils.to_shape_dtype_struct, transformer_metadata)
45+
state = metadatas.wan_state
46+
47+
def add_sharding_to_struct(leaf_struct, sharding):
48+
struct = ocp.utils.to_shape_dtype_struct(leaf_struct)
49+
if hasattr(struct, "shape") and hasattr(struct, "dtype"):
50+
return jax.ShapeDtypeStruct(shape=struct.shape, dtype=struct.dtype, sharding=sharding)
51+
return struct
52+
53+
target_shardings = jax.tree_util.tree_map(lambda x: replicated_sharding, state)
54+
55+
with mesh:
56+
abstract_train_state_with_sharding = jax.tree_util.tree_map(add_sharding_to_struct, state, target_shardings)
57+
4158
params_restore = ocp.args.PyTreeRestore(
4259
restore_args=jax.tree.map(
43-
lambda _: ocp.RestoreArgs(restore_type=np.ndarray),
44-
abstract_tree_structure_params,
60+
lambda _: ocp.RestoreArgs(restore_type=jax.Array),
61+
abstract_train_state_with_sharding,
4562
)
4663
)
4764

src/maxdiffusion/checkpointing/wan_checkpointer_i2v_2p1.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@
1515
"""
1616

1717
import json
18-
import jax
19-
import numpy as np
2018
from typing import Optional, Tuple
21-
from ..pipelines.wan.wan_pipeline_i2v_2p1 import WanPipelineI2V_2_1
22-
from .. import max_logging
23-
import orbax.checkpoint as ocp
2419
from etils import epath
20+
import jax
21+
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
2522
from maxdiffusion.checkpointing.wan_checkpointer import WanCheckpointer
23+
import numpy as np
24+
import orbax.checkpoint as ocp
25+
from .. import max_logging
26+
from ..pipelines.wan.wan_pipeline_i2v_2p1 import WanPipelineI2V_2_1
2627

2728

2829
class WanCheckpointerI2V_2_1(WanCheckpointer):
@@ -35,13 +36,29 @@ def load_wan_configs_from_orbax(self, step: Optional[int]) -> Tuple[Optional[dic
3536
max_logging.log("No WAN checkpoint found.")
3637
return None, None
3738
max_logging.log(f"Loading WAN checkpoint from step {step}")
39+
40+
cpu_devices = np.array(jax.devices(backend="cpu"))
41+
mesh = Mesh(cpu_devices, axis_names=("data",))
42+
replicated_sharding = NamedSharding(mesh, P())
43+
3844
metadatas = self.checkpoint_manager.item_metadata(step)
39-
transformer_metadata = metadatas.wan_state
40-
abstract_tree_structure_params = jax.tree_util.tree_map(ocp.utils.to_shape_dtype_struct, transformer_metadata)
45+
state = metadatas.wan_state
46+
47+
def add_sharding_to_struct(leaf_struct, sharding):
48+
struct = ocp.utils.to_shape_dtype_struct(leaf_struct)
49+
if hasattr(struct, "shape") and hasattr(struct, "dtype"):
50+
return jax.ShapeDtypeStruct(shape=struct.shape, dtype=struct.dtype, sharding=sharding)
51+
return struct
52+
53+
target_shardings = jax.tree_util.tree_map(lambda x: replicated_sharding, state)
54+
55+
with mesh:
56+
abstract_train_state_with_sharding = jax.tree_util.tree_map(add_sharding_to_struct, state, target_shardings)
57+
4158
params_restore = ocp.args.PyTreeRestore(
4259
restore_args=jax.tree.map(
43-
lambda _: ocp.RestoreArgs(restore_type=np.ndarray),
44-
abstract_tree_structure_params,
60+
lambda _: ocp.RestoreArgs(restore_type=jax.Array),
61+
abstract_train_state_with_sharding,
4562
)
4663
)
4764

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
"""Copyright 2025 Google LLC
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
https://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License.
14+
"""
15+
16+
import json
17+
from typing import Optional, Tuple
18+
import jax
19+
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
20+
from maxdiffusion.checkpointing.wan_checkpointer import WanCheckpointer
21+
import numpy as np
22+
import orbax.checkpoint as ocp
23+
from .. import max_logging
24+
from ..pipelines.wan.wan_vace_pipeline_2_1 import VaceWanPipeline2_1
25+
26+
27+
class WanVaceCheckpointer2_1(WanCheckpointer):
  """Checkpointer for the VACE WAN 2.1 pipeline.

  Extends WanCheckpointer with Orbax-based save/restore of the transformer
  state and config, restoring arrays as replicated jax.Arrays on CPU.
  """

  def load_wan_configs_from_orbax(self, step: Optional[int]) -> Tuple[Optional[dict], Optional[int]]:
    """Restore the WAN checkpoint (config + state) for *step* from Orbax.

    Args:
      step: Checkpoint step to restore. If None, the latest step known to
        the checkpoint manager is used.

    Returns:
      (restored_checkpoint, step) on success, or (None, None) when no
      checkpoint exists.
    """
    if step is None:
      step = self.checkpoint_manager.latest_step()
      max_logging.log(f"Latest WAN checkpoint step: {step}")
      if step is None:
        max_logging.log("No WAN checkpoint found.")
        return None, None
    max_logging.log(f"Loading WAN checkpoint from step {step}")

    # Build a 1-D CPU mesh and a fully-replicated sharding (empty PartitionSpec)
    # so the restored arrays land on host memory rather than accelerators.
    cpu_devices = np.array(jax.devices(backend="cpu"))
    mesh = Mesh(cpu_devices, axis_names=("data",))
    replicated_sharding = NamedSharding(mesh, P())

    metadatas = self.checkpoint_manager.item_metadata(step)
    state = metadatas.wan_state

    def add_sharding_to_struct(leaf_struct, sharding):
      # Convert checkpoint metadata into an abstract ShapeDtypeStruct and
      # attach the target sharding; leaves without shape/dtype (non-array
      # metadata) are passed through unchanged.
      struct = ocp.utils.to_shape_dtype_struct(leaf_struct)
      if hasattr(struct, "shape") and hasattr(struct, "dtype"):
        return jax.ShapeDtypeStruct(shape=struct.shape, dtype=struct.dtype, sharding=sharding)
      return struct

    # Every leaf gets the same replicated CPU sharding.
    target_shardings = jax.tree_util.tree_map(lambda x: replicated_sharding, state)

    with mesh:
      abstract_train_state_with_sharding = jax.tree_util.tree_map(add_sharding_to_struct, state, target_shardings)

    max_logging.log("Restoring WAN checkpoint")
    restored_checkpoint = self.checkpoint_manager.restore(
        step=step,
        args=ocp.args.Composite(
            wan_config=ocp.args.JsonRestore(),
            wan_state=ocp.args.StandardRestore(abstract_train_state_with_sharding),
        ),
    )
    max_logging.log(f"restored checkpoint {restored_checkpoint.keys()}")
    max_logging.log(f"restored checkpoint wan_state {restored_checkpoint.wan_state.keys()}")
    max_logging.log(f"optimizer found in checkpoint {'opt_state' in restored_checkpoint.wan_state.keys()}")
    # NOTE(review): self.opt_state is presumably set by the WanCheckpointer
    # base class — confirm it exists before this method runs.
    max_logging.log(f"optimizer state saved in attribute self.opt_state {self.opt_state}")
    return restored_checkpoint, step

  def load_diffusers_checkpoint(self):
    """Load the default pretrained pipeline from diffusers-format weights."""
    pipeline = VaceWanPipeline2_1.from_pretrained(self.config)
    return pipeline

  def load_checkpoint(self, step=None) -> Tuple[VaceWanPipeline2_1, Optional[dict], Optional[int]]:
    """Load the pipeline from an Orbax checkpoint, or fall back to pretrained.

    Args:
      step: Optional checkpoint step; None selects the latest.

    Returns:
      (pipeline, opt_state, step) — opt_state is None when the checkpoint
      has no "opt_state" entry or no checkpoint was found.
    """
    restored_checkpoint, step = self.load_wan_configs_from_orbax(step)
    opt_state = None
    if restored_checkpoint:
      max_logging.log("Loading WAN pipeline from checkpoint")
      pipeline = VaceWanPipeline2_1.from_checkpoint(self.config, restored_checkpoint)
      if "opt_state" in restored_checkpoint.wan_state.keys():
        opt_state = restored_checkpoint.wan_state["opt_state"]
    else:
      max_logging.log("No checkpoint found, loading default pipeline.")
      pipeline = self.load_diffusers_checkpoint()

    return pipeline, opt_state, step

  def save_checkpoint(self, train_step, pipeline: VaceWanPipeline2_1, train_states: dict):
    """Saves the training state and model configurations."""

    def config_to_json(model_or_config):
      # Round-trip through the model's JSON string to get a plain dict
      # that Orbax's JsonSave can serialize.
      return json.loads(model_or_config.to_json_string())

    max_logging.log(f"Saving checkpoint for step {train_step}")

    # Save the checkpoint
    self.checkpoint_manager.save(
        train_step,
        args=ocp.args.Composite(
            wan_config=ocp.args.JsonSave(config_to_json(pipeline.transformer)),
            wan_state=ocp.args.StandardSave(train_states),
        ),
    )

    max_logging.log(f"Checkpoint for step {train_step} is saved.")

src/maxdiffusion/configs/base14.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
206206
adam_b2: 0.999 # Exponential decay rate to track the second moment of past gradients.
207207
adam_eps: 1.e-8 # A small constant applied to denominator outside of the square root.
208208
adam_weight_decay: 1.e-2 # AdamW Weight decay
209+
opt_enable_grad_clipping: False
210+
max_grad_value: 1.0
211+
opt_enable_grad_global_norm_clipping: False
209212
max_grad_norm: 1.0
210213

211214
enable_profiler: False

src/maxdiffusion/configs/base21.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
211211
adam_b2: 0.999 # Exponential decay rate to track the second moment of past gradients.
212212
adam_eps: 1.e-8 # A small constant applied to denominator outside of the square root.
213213
adam_weight_decay: 1.e-2 # AdamW Weight decay
214+
opt_enable_grad_clipping: False
215+
max_grad_value: 1.0
216+
opt_enable_grad_global_norm_clipping: False
214217
max_grad_norm: 1.0
215218

216219
enable_profiler: False

src/maxdiffusion/configs/base_2_base.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,9 @@ adam_b1: 0.9 # Exponential decay rate to track the first moment of past gradient
221221
adam_b2: 0.999 # Exponential decay rate to track the second moment of past gradients.
222222
adam_eps: 1.e-8 # A small constant applied to denominator outside of the square root.
223223
adam_weight_decay: 1.e-2 # AdamW Weight decay
224+
opt_enable_grad_clipping: False
225+
max_grad_value: 1.0
226+
opt_enable_grad_global_norm_clipping: False
224227
max_grad_norm: 1.0
225228

226229
enable_profiler: False

0 commit comments

Comments
 (0)