@@ -703,3 +703,98 @@ def translate_wan_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
703703 return f"diffusion_model.blocks.{ idx } .{ suffix_map [inner_suffix ]} "
704704
705705 return None
706+
707+
def translate_ltx2_nnx_path_to_diffusers_lora(nnx_path_str, scan_layers=False):
    """Translate an LTX2 NNX parameter path to its Diffusers/LoRA key.

    Args:
        nnx_path_str: Dot-separated NNX module path, e.g.
            ``"transformer_blocks.3.attn1.to_q"`` or ``"proj_in"``.
        scan_layers: If True, per-block paths carry no numeric block index
            (weights are scanned over layers); the returned key then contains
            a literal ``"{}"`` placeholder where the block index would go,
            for the caller to fill in.

    Returns:
        The corresponding Diffusers/LoRA key string, or ``None`` if the path
        is not recognized.
    """
    # Per-transformer-block suffixes: NNX name -> Diffusers/LoRA name.
    # Every attention module maps q/k/v straight through, while to_out maps
    # to "to_out.0" (Diffusers wraps the output projection in a ModuleList).
    suffix_map = {}
    for attn in (
        "attn1",                # self attention
        "audio_attn1",          # audio self attention
        "audio_attn2",          # audio cross attention
        "attn2",                # cross attention
        "audio_to_video_attn",  # audio -> video cross attention
        "video_to_audio_attn",  # video -> audio cross attention
    ):
        for proj in ("to_q", "to_k", "to_v"):
            suffix_map[f"{attn}.{proj}"] = f"{attn}.{proj}"
        suffix_map[f"{attn}.to_out"] = f"{attn}.to_out.0"
    # Feed-forward layers: net_0 is the GEGLU projection, net_2 the output linear.
    suffix_map.update(
        {
            "ff.net_0": "ff.net.0.proj",
            "ff.net_2": "ff.net.2",
            "audio_ff.net_0": "audio_ff.net.0.proj",
            "audio_ff.net_2": "audio_ff.net.2",
        }
    )

    # Non-block (global) parameters: exact full-path lookups.
    global_map = {
        "proj_in": "diffusion_model.patchify_proj",
        "audio_proj_in": "diffusion_model.audio_patchify_proj",
        "proj_out": "diffusion_model.proj_out",
        "audio_proj_out": "diffusion_model.audio_proj_out",
        "time_embed.linear": "diffusion_model.adaln_single.linear",
        "audio_time_embed.linear": "diffusion_model.audio_adaln_single.linear",
        "av_cross_attn_video_a2v_gate.linear": "diffusion_model.av_ca_a2v_gate_adaln_single.linear",
        "av_cross_attn_audio_v2a_gate.linear": "diffusion_model.av_ca_v2a_gate_adaln_single.linear",
        "av_cross_attn_audio_scale_shift.linear": "diffusion_model.av_ca_audio_scale_shift_adaln_single.linear",
        "av_cross_attn_video_scale_shift.linear": "diffusion_model.av_ca_video_scale_shift_adaln_single.linear",
        # Nested conditioning (timestep embedder) layers.
        "time_embed.emb.timestep_embedder.linear_1": "diffusion_model.adaln_single.emb.timestep_embedder.linear_1",
        "time_embed.emb.timestep_embedder.linear_2": "diffusion_model.adaln_single.emb.timestep_embedder.linear_2",
        "audio_time_embed.emb.timestep_embedder.linear_1": "diffusion_model.audio_adaln_single.emb.timestep_embedder.linear_1",
        "audio_time_embed.emb.timestep_embedder.linear_2": "diffusion_model.audio_adaln_single.emb.timestep_embedder.linear_2",
        "av_cross_attn_video_scale_shift.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear_1",
        "av_cross_attn_video_scale_shift.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_video_scale_shift_adaln_single.emb.timestep_embedder.linear_2",
        "av_cross_attn_audio_scale_shift.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear_1",
        "av_cross_attn_audio_scale_shift.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_audio_scale_shift_adaln_single.emb.timestep_embedder.linear_2",
        "av_cross_attn_video_a2v_gate.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear_1",
        "av_cross_attn_video_a2v_gate.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_a2v_gate_adaln_single.emb.timestep_embedder.linear_2",
        "av_cross_attn_audio_v2a_gate.emb.timestep_embedder.linear_1": "diffusion_model.av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear_1",
        "av_cross_attn_audio_v2a_gate.emb.timestep_embedder.linear_2": "diffusion_model.av_ca_v2a_gate_adaln_single.emb.timestep_embedder.linear_2",
        "caption_projection.linear_1": "diffusion_model.caption_projection.linear_1",
        "caption_projection.linear_2": "diffusion_model.caption_projection.linear_2",
        "audio_caption_projection.linear_1": "diffusion_model.audio_caption_projection.linear_1",
        "audio_caption_projection.linear_2": "diffusion_model.audio_caption_projection.linear_2",
        # Connectors.
        "feature_extractor.linear": "text_embedding_projection.aggregate_embed",
    }

    if nnx_path_str in global_map:
        return global_map[nnx_path_str]

    if scan_layers:
        # Scanned layers share one parameter tree with no per-block index;
        # emit a literal "{}" placeholder for the caller to format in.
        if nnx_path_str.startswith("transformer_blocks."):
            inner_suffix = nnx_path_str[len("transformer_blocks."):]
            if inner_suffix in suffix_map:
                return f"diffusion_model.transformer_blocks.{{}}.{suffix_map[inner_suffix]}"
    else:
        m = re.match(r"^transformer_blocks\.(\d+)\.(.+)$", nnx_path_str)
        if m:
            idx, inner_suffix = m.group(1), m.group(2)
            if inner_suffix in suffix_map:
                return f"diffusion_model.transformer_blocks.{idx}.{suffix_map[inner_suffix]}"

    # Unrecognized path (e.g. norms or buffers that have no LoRA target).
    return None