hao-ai-lab · SolitaryThinker · Apr 14, 2026 · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026
diff --git a/docs/design/inference_schema_parity_inventory.yaml b/docs/design/inference_schema_parity_inventory.yaml
@@ -495,230 +495,14 @@ cli:
   notes:
     - "CLI parity is checked against the actual generate/serve parser dest sets."
     - "The inventory tracks parser dest names, excluding argparse's implicit help action."
+    - "The refactored inference CLI is config-only: the generate/serve parsers define only --config, so any other setting passed on the command line must be written as a dotted override path."
   generate:
     explicit_local_fields:
       - config
     expected_dests:
-      - VSA_sparsity
-      - boundary_ratio
-      - bsa_cdf_threshold
-      - bsa_chunk_k
-      - bsa_chunk_q
-      - bsa_sparsity
       - config
-      - disable_autocast
-      - dist_timeout
-      - distributed_executor_backend
-      - dit_config.prefix
-      - dit_config.quant_config
-      - dit_cpu_offload
-      - dit_layerwise_offload
-      - dit_precision
-      - dmd_denoising_steps
-      - embedded_cfg_scale
-      - enable_bsa
-      - enable_stage_verification
-      - enable_torch_compile
-      - flow_shift
-      - fps
-      - guidance_rescale
-      - guidance_scale
-      - height
-      - hsdp_replicate_dim
-      - hsdp_shard_dim
-      - image_encoder_cpu_offload
-      - image_encoder_precision
-      - image_path
-      - inference_mode
-      - init_weights_from_safetensors
-      - init_weights_from_safetensors_2
-      - lora_nickname
-      - lora_path
-      - lora_target_modules
-      - ltx2_initial_latent_path
-      - ltx2_vae_spatial_tile_overlap_in_pixels
-      - ltx2_vae_spatial_tile_size_in_pixels
-      - ltx2_vae_temporal_tile_overlap_in_frames
-      - ltx2_vae_temporal_tile_size_in_frames
-      - ltx2_vae_tiling
-      - master_port
-      - moba_config_path
-      - mode
-      - model_path
-      - negative_prompt
-      - num_cond_frames
-      - num_frames
-      - num_gpus
-      - num_inference_steps
-      - num_videos_per_prompt
-      - output_path
-      - output_type
-      - output_video_name
-      - override_pipeline_cls_name
-      - override_text_encoder_quant
-      - override_text_encoder_safetensors
-      - override_transformer_cls_name
-      - pin_cpu_memory
-      - pipeline_config_path
-      - preprocess.dataloader_num_workers
-      - preprocess.dataset_output_dir
-      - preprocess.dataset_path
-      - preprocess.dataset_type
-      - preprocess.do_temporal_sample
-      - preprocess.drop_short_ratio
-      - preprocess.flush_frequency
-      - preprocess.max_height
-      - preprocess.max_width
-      - preprocess.model_path
-      - preprocess.num_frames
-      - preprocess.preprocess_video_batch_size
-      - preprocess.samples_per_file
-      - preprocess.seed
-      - preprocess.speed_factor
-      - preprocess.train_fps
-      - preprocess.training_cfg_rate
-      - preprocess.video_length_tolerance_range
-      - preprocess.video_loader_type
-      - preprocess.with_audio
-      - prompt
-      - prompt_path
-      - prompt_txt
-      - refine_from
-      - return_frames
-      - return_trajectory_decoded
-      - return_trajectory_latents
-      - revision
-      - save_video
-      - seed
-      - sp_size
-      - spatial_refine_only
-      - t_thresh
-      - text_encoder_configs
-      - text_encoder_cpu_offload
-      - text_encoder_precisions
-      - torch_compile_kwargs
-      - tp_size
-      - trust_remote_code
-      - use_fsdp_inference
-      - vae_config.blend_num_frames
-      - vae_config.load_decoder
-      - vae_config.load_encoder
-      - vae_config.tile_sample_min_height
-      - vae_config.tile_sample_min_num_frames
-      - vae_config.tile_sample_min_width
-      - vae_config.tile_sample_stride_height
-      - vae_config.tile_sample_stride_num_frames
-      - vae_config.tile_sample_stride_width
-      - vae_config.use_parallel_tiling
-      - vae_config.use_temporal_tiling
-      - vae_config.use_tiling
-      - vae_cpu_offload
-      - vae_precision
-      - vae_sp
-      - vae_tiling
-      - video_path
-      - width
-      - workload_type
   serve:
     explicit_local_fields:
       - config
-      - host
-      - output_dir
-      - port
     expected_dests:
-      - VSA_sparsity
-      - bsa_cdf_threshold
-      - bsa_chunk_k
-      - bsa_chunk_q
-      - bsa_sparsity
       - config
-      - disable_autocast
-      - dist_timeout
-      - distributed_executor_backend
-      - dit_config.prefix
-      - dit_config.quant_config
-      - dit_cpu_offload
-      - dit_layerwise_offload
-      - dit_precision
-      - dmd_denoising_steps
-      - embedded_cfg_scale
-      - enable_bsa
-      - enable_stage_verification
-      - enable_torch_compile
-      - flow_shift
-      - host
-      - hsdp_replicate_dim
-      - hsdp_shard_dim
-      - image_encoder_cpu_offload
-      - image_encoder_precision
-      - inference_mode
-      - init_weights_from_safetensors
-      - init_weights_from_safetensors_2
-      - lora_nickname
-      - lora_path
-      - lora_target_modules
-      - ltx2_initial_latent_path
-      - ltx2_vae_spatial_tile_overlap_in_pixels
-      - ltx2_vae_spatial_tile_size_in_pixels
-      - ltx2_vae_temporal_tile_overlap_in_frames
-      - ltx2_vae_temporal_tile_size_in_frames
-      - ltx2_vae_tiling
-      - master_port
-      - mode
-      - model_path
-      - num_gpus
-      - output_dir
-      - output_type
-      - override_pipeline_cls_name
-      - override_text_encoder_quant
-      - override_text_encoder_safetensors
-      - override_transformer_cls_name
-      - pin_cpu_memory
-      - pipeline_config_path
-      - port
-      - preprocess.dataloader_num_workers
-      - preprocess.dataset_output_dir
-      - preprocess.dataset_path
-      - preprocess.dataset_type
-      - preprocess.do_temporal_sample
-      - preprocess.drop_short_ratio
-      - preprocess.flush_frequency
-      - preprocess.max_height
-      - preprocess.max_width
-      - preprocess.model_path
-      - preprocess.num_frames
-      - preprocess.preprocess_video_batch_size
-      - preprocess.samples_per_file
-      - preprocess.seed
-      - preprocess.speed_factor
-      - preprocess.train_fps
-      - preprocess.training_cfg_rate
-      - preprocess.video_length_tolerance_range
-      - preprocess.video_loader_type
-      - preprocess.with_audio
-      - prompt_txt
-      - revision
-      - sp_size
-      - text_encoder_cpu_offload
-      - text_encoder_precisions
-      - torch_compile_kwargs
-      - tp_size
-      - trust_remote_code
-      - use_fsdp_inference
-      - vae_config.blend_num_frames
-      - vae_config.load_decoder
-      - vae_config.load_encoder
-      - vae_config.tile_sample_min_height
-      - vae_config.tile_sample_min_num_frames
-      - vae_config.tile_sample_min_width
-      - vae_config.tile_sample_stride_height
-      - vae_config.tile_sample_stride_num_frames
-      - vae_config.tile_sample_stride_width
-      - vae_config.use_parallel_tiling
-      - vae_config.use_temporal_tiling
-      - vae_config.use_tiling
-      - vae_cpu_offload
-      - vae_precision
-      - vae_sp
-      - vae_tiling
-      - workload_type
diff --git a/docs/distillation/dmd.md b/docs/distillation/dmd.md
@@ -16,7 +16,8 @@ Both models are trained on **61×448×832** resolution but support generating vi
 First install [VSA](../attention/vsa/index.md). Set `MODEL_BASE` to your own model path and run:
 
 ```bash
-bash scripts/inference/v1_inference_wan_dmd.sh
+FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN \
+  fastvideo generate --config scripts/inference/inference_wan_VSA_DMD_1_3B.yaml
 ```
 
 ## 🗂️ Dataset

diff --git a/docs/inference/architecture.md b/docs/inference/architecture.md
@@ -455,6 +455,6 @@ User: generator.generate_video(prompt, ...)
    `fastvideo/pipelines/stages/`, implement `forward()`, optionally
    implement `verify_input()`/`verify_output()`.
 
-7. **Verify** — Run `fastvideo generate --model-path <path> --prompt
-   "test" --num-inference-steps 2` to confirm the pipeline loads and
-   generates output.
+7. **Verify** — Run `fastvideo generate --config <config.yaml>` with a
+   minimal nested config to confirm the pipeline loads and generates
+   output.