Skip to content

Commit 84aa5ef

Browse files
committed
connectors and feat extractors
1 parent 7f5057f commit 84aa5ef

4 files changed

Lines changed: 22 additions & 6 deletions

File tree

src/maxdiffusion/models/ltx2/ltx2_3_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
"model.diffusion_model.": "",
1818
"connectors.": "",
1919
"transformer_1d_blocks": "stacked_blocks",
20-
"text_embedding_projection.audio_aggregate_embed": "audio_text_proj_in",
21-
"text_embedding_projection.video_aggregate_embed": "video_text_proj_in",
20+
"text_embedding_projection.audio_aggregate_embed.weight": "feature_extractor.audio_linear.kernel",
21+
"text_embedding_projection.audio_aggregate_embed.bias": "feature_extractor.audio_linear.bias",
22+
"text_embedding_projection.video_aggregate_embed.weight": "feature_extractor.video_linear.kernel",
23+
"text_embedding_projection.video_aggregate_embed.bias": "feature_extractor.video_linear.bias",
2224
"q_norm": "norm_q",
2325
"k_norm": "norm_k",
2426
"norm_q.weight": "norm_q.scale",
@@ -91,6 +93,10 @@ def load_connectors_weights(
9193

9294
accumulated_stacked[base_key][layer_idx] = tensor
9395
else:
96+
# Transpose projection kernels in feature extractor
97+
if "feature_extractor" in segments and segments[-1] == "kernel":
98+
tensor = jnp.transpose(tensor, (1, 0))
99+
94100
flax_key = _tuple_str_to_int(segments)
95101
flax_state_dict[flax_key] = jax.device_put(tensor, device=cpu)
96102

src/maxdiffusion/models/ltx2/ltx2_utils.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,9 @@ def rename_for_ltx2_audio_vae(key):
496496
if "upsample.conv.bias" in key:
497497
key = key.replace("upsample.conv.bias", "upsample.conv.conv.bias")
498498

499+
key = key.replace("per_channel_statistics.mean-of-means", "latents_mean")
500+
key = key.replace("per_channel_statistics.std-of-means", "latents_std")
501+
499502
return key
500503

501504

src/maxdiffusion/models/ltx2/text_encoders/feature_extractor_ltx2.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,17 +104,21 @@ def __init__(
104104
rngs: nnx.Rngs = None,
105105
per_modality_projections: bool = False,
106106
use_bias: bool = False,
107+
video_output_dim: Optional[int] = None,
108+
audio_output_dim: Optional[int] = None,
107109
):
108110
"""
109111
Args:
110112
input_dim: Dimension of flattened hidden states (Gemma dim * Num layers).
111-
output_dim: Target dimension for diffusion conditioning.
113+
output_dim: Target dimension for diffusion conditioning (fallback).
112114
"""
113115
self.per_modality_projections = per_modality_projections
114116

115117
if per_modality_projections:
116-
self.video_linear = nnx.Linear(input_dim, output_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
117-
self.audio_linear = nnx.Linear(input_dim, output_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
118+
v_dim = video_output_dim if video_output_dim is not None else output_dim
119+
a_dim = audio_output_dim if audio_output_dim is not None else output_dim
120+
self.video_linear = nnx.Linear(input_dim, v_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
121+
self.audio_linear = nnx.Linear(input_dim, a_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
118122
else:
119123
self.linear = nnx.Linear(input_dim, output_dim, use_bias=use_bias, dtype=dtype, rngs=rngs)
120124

src/maxdiffusion/models/ltx2/text_encoders/text_encoders_ltx2.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ def __init__(
6565
audio_gated_attn: bool = False,
6666
**kwargs,
6767
):
68-
input_dim = caption_channels * text_proj_in_factor
68+
gemma_dim = 3840 if video_caption_channels is not None else caption_channels
69+
input_dim = gemma_dim * text_proj_in_factor
6970

7071
v_dim = video_caption_channels if video_caption_channels is not None else caption_channels
7172
a_dim = audio_caption_channels if audio_caption_channels is not None else caption_channels
@@ -79,6 +80,8 @@ def __init__(
7980
rngs=rngs,
8081
per_modality_projections=per_modality_projections,
8182
use_bias=proj_bias,
83+
video_output_dim=v_dim,
84+
audio_output_dim=a_dim,
8285
)
8386

8487
# Two independent connectors

0 commit comments

Comments (0)