diff --git a/docs/design/inference_schema_parity_inventory.yaml b/docs/design/inference_schema_parity_inventory.yaml index f4a6a66811..5f573a1fe9 100644 --- a/docs/design/inference_schema_parity_inventory.yaml +++ b/docs/design/inference_schema_parity_inventory.yaml @@ -495,230 +495,14 @@ cli: notes: - "CLI parity is checked against the actual generate/serve parser dest sets." - "The inventory tracks parser dest names, excluding argparse's implicit help action." + - "The refactored inference CLI is config-only: subcommands expose only --config, and any additional CLI input must use dotted override paths." generate: explicit_local_fields: - config expected_dests: - - VSA_sparsity - - boundary_ratio - - bsa_cdf_threshold - - bsa_chunk_k - - bsa_chunk_q - - bsa_sparsity - config - - disable_autocast - - dist_timeout - - distributed_executor_backend - - dit_config.prefix - - dit_config.quant_config - - dit_cpu_offload - - dit_layerwise_offload - - dit_precision - - dmd_denoising_steps - - embedded_cfg_scale - - enable_bsa - - enable_stage_verification - - enable_torch_compile - - flow_shift - - fps - - guidance_rescale - - guidance_scale - - height - - hsdp_replicate_dim - - hsdp_shard_dim - - image_encoder_cpu_offload - - image_encoder_precision - - image_path - - inference_mode - - init_weights_from_safetensors - - init_weights_from_safetensors_2 - - lora_nickname - - lora_path - - lora_target_modules - - ltx2_initial_latent_path - - ltx2_vae_spatial_tile_overlap_in_pixels - - ltx2_vae_spatial_tile_size_in_pixels - - ltx2_vae_temporal_tile_overlap_in_frames - - ltx2_vae_temporal_tile_size_in_frames - - ltx2_vae_tiling - - master_port - - moba_config_path - - mode - - model_path - - negative_prompt - - num_cond_frames - - num_frames - - num_gpus - - num_inference_steps - - num_videos_per_prompt - - output_path - - output_type - - output_video_name - - override_pipeline_cls_name - - override_text_encoder_quant - - override_text_encoder_safetensors - - 
override_transformer_cls_name - - pin_cpu_memory - - pipeline_config_path - - preprocess.dataloader_num_workers - - preprocess.dataset_output_dir - - preprocess.dataset_path - - preprocess.dataset_type - - preprocess.do_temporal_sample - - preprocess.drop_short_ratio - - preprocess.flush_frequency - - preprocess.max_height - - preprocess.max_width - - preprocess.model_path - - preprocess.num_frames - - preprocess.preprocess_video_batch_size - - preprocess.samples_per_file - - preprocess.seed - - preprocess.speed_factor - - preprocess.train_fps - - preprocess.training_cfg_rate - - preprocess.video_length_tolerance_range - - preprocess.video_loader_type - - preprocess.with_audio - - prompt - - prompt_path - - prompt_txt - - refine_from - - return_frames - - return_trajectory_decoded - - return_trajectory_latents - - revision - - save_video - - seed - - sp_size - - spatial_refine_only - - t_thresh - - text_encoder_configs - - text_encoder_cpu_offload - - text_encoder_precisions - - torch_compile_kwargs - - tp_size - - trust_remote_code - - use_fsdp_inference - - vae_config.blend_num_frames - - vae_config.load_decoder - - vae_config.load_encoder - - vae_config.tile_sample_min_height - - vae_config.tile_sample_min_num_frames - - vae_config.tile_sample_min_width - - vae_config.tile_sample_stride_height - - vae_config.tile_sample_stride_num_frames - - vae_config.tile_sample_stride_width - - vae_config.use_parallel_tiling - - vae_config.use_temporal_tiling - - vae_config.use_tiling - - vae_cpu_offload - - vae_precision - - vae_sp - - vae_tiling - - video_path - - width - - workload_type serve: explicit_local_fields: - config - - host - - output_dir - - port expected_dests: - - VSA_sparsity - - bsa_cdf_threshold - - bsa_chunk_k - - bsa_chunk_q - - bsa_sparsity - config - - disable_autocast - - dist_timeout - - distributed_executor_backend - - dit_config.prefix - - dit_config.quant_config - - dit_cpu_offload - - dit_layerwise_offload - - dit_precision - - dmd_denoising_steps 
- - embedded_cfg_scale - - enable_bsa - - enable_stage_verification - - enable_torch_compile - - flow_shift - - host - - hsdp_replicate_dim - - hsdp_shard_dim - - image_encoder_cpu_offload - - image_encoder_precision - - inference_mode - - init_weights_from_safetensors - - init_weights_from_safetensors_2 - - lora_nickname - - lora_path - - lora_target_modules - - ltx2_initial_latent_path - - ltx2_vae_spatial_tile_overlap_in_pixels - - ltx2_vae_spatial_tile_size_in_pixels - - ltx2_vae_temporal_tile_overlap_in_frames - - ltx2_vae_temporal_tile_size_in_frames - - ltx2_vae_tiling - - master_port - - mode - - model_path - - num_gpus - - output_dir - - output_type - - override_pipeline_cls_name - - override_text_encoder_quant - - override_text_encoder_safetensors - - override_transformer_cls_name - - pin_cpu_memory - - pipeline_config_path - - port - - preprocess.dataloader_num_workers - - preprocess.dataset_output_dir - - preprocess.dataset_path - - preprocess.dataset_type - - preprocess.do_temporal_sample - - preprocess.drop_short_ratio - - preprocess.flush_frequency - - preprocess.max_height - - preprocess.max_width - - preprocess.model_path - - preprocess.num_frames - - preprocess.preprocess_video_batch_size - - preprocess.samples_per_file - - preprocess.seed - - preprocess.speed_factor - - preprocess.train_fps - - preprocess.training_cfg_rate - - preprocess.video_length_tolerance_range - - preprocess.video_loader_type - - preprocess.with_audio - - prompt_txt - - revision - - sp_size - - text_encoder_cpu_offload - - text_encoder_precisions - - torch_compile_kwargs - - tp_size - - trust_remote_code - - use_fsdp_inference - - vae_config.blend_num_frames - - vae_config.load_decoder - - vae_config.load_encoder - - vae_config.tile_sample_min_height - - vae_config.tile_sample_min_num_frames - - vae_config.tile_sample_min_width - - vae_config.tile_sample_stride_height - - vae_config.tile_sample_stride_num_frames - - vae_config.tile_sample_stride_width - - 
vae_config.use_parallel_tiling - - vae_config.use_temporal_tiling - - vae_config.use_tiling - - vae_cpu_offload - - vae_precision - - vae_sp - - vae_tiling - - workload_type diff --git a/docs/distillation/dmd.md b/docs/distillation/dmd.md index c0bee0586a..6608ec029c 100644 --- a/docs/distillation/dmd.md +++ b/docs/distillation/dmd.md @@ -16,7 +16,8 @@ Both models are trained on **61×448×832** resolution but support generating vi First install [VSA](../attention/vsa/index.md). Set `MODEL_BASE` to your own model path and run: ```bash -bash scripts/inference/v1_inference_wan_dmd.sh +FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN \ + fastvideo generate --config scripts/inference/inference_wan_VSA_DMD_1_3B.yaml ``` ## 🗂️ Dataset diff --git a/docs/inference/architecture.md b/docs/inference/architecture.md index bdfd1e8c33..d36bf8412d 100644 --- a/docs/inference/architecture.md +++ b/docs/inference/architecture.md @@ -455,6 +455,6 @@ User: generator.generate_video(prompt, ...) `fastvideo/pipelines/stages/`, implement `forward()`, optionally implement `verify_input()`/`verify_output()`. -7. **Verify** — Run `fastvideo generate --model-path --prompt - "test" --num-inference-steps 2` to confirm the pipeline loads and - generates output. +7. **Verify** — Run `fastvideo generate --config ` with a + minimal nested config to confirm the pipeline loads and generates + output. diff --git a/docs/inference/cli.md b/docs/inference/cli.md index 991b9be98b..27fc992522 100644 --- a/docs/inference/cli.md +++ b/docs/inference/cli.md @@ -1,71 +1,29 @@ # FastVideo CLI Inference -The FastVideo CLI exposes the same core inference controls as the Python API. +The FastVideo CLI is config-first. Inference runs are driven by a nested JSON or +YAML config, with optional dotted-path overrides on the command line. The +contract matches training: use an explicit subcommand plus `--config`, then add +any dotted overrides you need. ## Basic Usage -Use either: - -1. `--model-path` + `--prompt` -2. 
`--model-path` + `--prompt-txt` (batch prompts, one line per prompt) -3. `--config` (JSON/YAML) - -```bash -fastvideo generate --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers \ - --prompt "A cat playing with a ball of yarn" -``` - ```bash -fastvideo generate --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers \ - --prompt-txt prompts.txt +fastvideo generate --config config.yaml +fastvideo serve --config serve.yaml ``` -You cannot provide both `--prompt` and `--prompt-txt` in the same run. - ## View All Arguments ```bash fastvideo generate --help ``` -Arguments come from: - -- FastVideo runtime args (`FastVideoArgs`) -- Sampling args (`SamplingParam`) -- Pipeline config args (`PipelineConfig`) - -## Common Arguments - -### Parallelism - -- `--num-gpus` -- `--sp-size` -- `--tp-size` +The subcommands intentionally expose only `--config`. Any per-run CLI changes +must use dotted override paths such as: -### Sampling - -- `--num-frames` -- `--height` / `--width` -- `--num-inference-steps` -- `--guidance-scale` -- `--seed` -- `--negative-prompt` - -### Output - -- `--output-path` -- `--save-video` / `--no-save-video` -- `--return-frames` - -### Offloading and Performance - -- `--dit-layerwise-offload` -- `--use-fsdp-inference` -- `--text-encoder-cpu-offload` -- `--image-encoder-cpu-offload` -- `--vae-cpu-offload` -- `--enable-torch-compile` -- `--torch-compile-kwargs` +- `--generator.engine.num_gpus 2` +- `--request.sampling.seed 42` +- `--server.port 9000` ## Using Config Files @@ -73,50 +31,53 @@ Arguments come from: fastvideo generate --config config.yaml ``` -Config files can be JSON or YAML. CLI flags override config-file values. +Config files can be JSON or YAML. Dotted CLI overrides take precedence over +config-file values. 
Example `config.yaml`: ```yaml -model_path: "FastVideo/FastHunyuan-diffusers" -prompt: "A capybara lounging in a hammock" -output_path: "outputs/" -num_gpus: 2 -sp_size: 2 -tp_size: 1 -num_frames: 45 -height: 720 -width: 1280 -num_inference_steps: 6 -seed: 1024 -dit_precision: "bf16" -vae_precision: "fp16" -vae_tiling: true -vae_sp: true -enable_torch_compile: false +generator: + model_path: FastVideo/FastHunyuan-diffusers + engine: + num_gpus: 2 + parallelism: + sp_size: 2 + tp_size: 1 +request: + prompt: A capybara lounging in a hammock + sampling: + num_frames: 45 + height: 720 + width: 1280 + num_inference_steps: 6 + seed: 1024 + output: + output_path: outputs/ ``` Notes: -- Use `dit_precision` / `vae_precision` (not `precision`). -- Nested config objects are supported, for example `vae_config` and - `dit_config`. +- `generator` and `request` are the top-level keys for generation configs. +- `serve` configs use `generator`, `server`, and optional `default_request`. +- Prompt text files belong under `request.inputs.prompt_path`. 
## Examples Simple generation: ```bash -fastvideo generate \ - --model-path FastVideo/FastHunyuan-diffusers \ - --prompt "A cat playing with a ball of yarn" \ - --num-frames 45 --height 720 --width 1280 \ - --num-inference-steps 6 --seed 1024 \ - --output-path outputs/ +fastvideo generate --config config.yaml +``` + +Config + dotted override: + +```bash +fastvideo generate --config config.yaml --request.prompt "A panda skiing at sunset" ``` -Config + CLI override: +Helper wrapper with positional config path: ```bash -fastvideo generate --config config.yaml --prompt "A panda skiing at sunset" +bash scripts/inference/run.sh scripts/inference/inference_wan.yaml ``` diff --git a/docs/inference/configuration.md b/docs/inference/configuration.md index e559ff000e..2fdfe17fd1 100644 --- a/docs/inference/configuration.md +++ b/docs/inference/configuration.md @@ -73,32 +73,40 @@ if __name__ == '__main__': ## JSON/YAML Config Files (CLI) -The CLI supports `--config` with JSON or YAML. Command-line arguments override -config file values. -By default, `fastvideo generate` uses `return_frames=false` unless you set -`--return-frames` (or `return_frames: true` in config). +The inference CLI is config-first. Use an explicit subcommand with `--config`, +then apply optional dotted overrides on top, matching the training CLI style. +By default, CLI generation uses `return_frames=false` unless you set +`request.output.return_frames: true` in config or via a dotted override. ```bash fastvideo generate --config config.yaml ``` -Use CLI argument names as keys (underscore or hyphen is accepted). 
Example: +Example nested config: ```yaml -model_path: "FastVideo/FastHunyuan-diffusers" -prompt: "A capybara relaxing in a hammock" -num_gpus: 2 -sp_size: 2 -num_frames: 45 -height: 720 -width: 1280 -num_inference_steps: 6 -seed: 1024 -dit_precision: "bf16" -vae_precision: "fp16" -vae_tiling: true -vae_sp: true -enable_torch_compile: false +generator: + model_path: FastVideo/FastHunyuan-diffusers + engine: + num_gpus: 2 + parallelism: + sp_size: 2 +request: + prompt: A capybara relaxing in a hammock + sampling: + num_frames: 45 + height: 720 + width: 1280 + num_inference_steps: 6 + seed: 1024 + output: + output_path: outputs/ +``` + +Override individual values from the CLI with dotted paths: + +```bash +fastvideo generate --config config.yaml --request.sampling.seed 42 ``` ## Performance Optimization diff --git a/fastvideo-kernel/build.sh b/fastvideo-kernel/build.sh index 1079523754..43e31b8642 100755 --- a/fastvideo-kernel/build.sh +++ b/fastvideo-kernel/build.sh @@ -10,6 +10,35 @@ set -ex echo "Building fastvideo-kernel..." +# --------------------------------------------------------------------------- +# Neutralise conda-injected compiler toolchains. +# +# Conda compiler packages (gcc_linux-aarch64, gxx_linux-64, etc.) set +# CMAKE_ARGS, CFLAGS, CXXFLAGS, and LDFLAGS on activation. When multiple +# toolchains are installed the variables can reference a *cross*-compiler +# that doesn't match the host (e.g. aarch64-conda-linux-gnu-c++ on x86_64). +# Even when the correct toolchain is active, the flags it injects +# (-march=nocona, -mtune=haswell, …) can conflict with nvcc's host-compiler +# expectations. Clear them so CMake discovers the system compiler instead. +# --------------------------------------------------------------------------- +if [[ -n "${CONDA_PREFIX:-}" ]]; then + _need_clean=0 + # Detect conda cross-compiler that doesn't match the host. 
+ _host_arch="$(uname -m)" + if [[ "${CXX:-}" == *"conda"* ]] || [[ "${CC:-}" == *"conda"* ]]; then + _need_clean=1 + fi + if [[ "${CMAKE_ARGS:-}" == *"conda"* ]]; then + _need_clean=1 + fi + if (( _need_clean )); then + echo "NOTE: Clearing conda-injected compiler settings (CC/CXX/CMAKE_ARGS/CFLAGS/...)" + echo " to use the system compiler for CUDA extension builds." + unset CC CXX CMAKE_ARGS CFLAGS CXXFLAGS LDFLAGS + fi + unset _need_clean _host_arch +fi + # Ensure submodules are initialized if needed (tk) git submodule update --init --recursive diff --git a/fastvideo/api/compat.py b/fastvideo/api/compat.py index fefb749768..51ea864138 100644 --- a/fastvideo/api/compat.py +++ b/fastvideo/api/compat.py @@ -9,6 +9,11 @@ from fastvideo.api.overrides import apply_overrides, parse_cli_overrides from fastvideo.api.parser import config_to_dict, load_raw_config, parse_config +from fastvideo.api.request_metadata import ( + EXPLICIT_REQUEST_ATTR, + bind_generation_request_raw, + refresh_generation_request_raw, +) from fastvideo.api.schema import ( GenerationRequest, GeneratorConfig, @@ -21,7 +26,6 @@ from fastvideo.fastvideo_args import FastVideoArgs from fastvideo.utils import shallow_asdict -_EXPLICIT_REQUEST_ATTR = "_fastvideo_explicit_request" _INPUT_FIELD_NAMES = {field.name for field in fields(InputConfig)} _SAMPLING_FIELD_NAMES = {field.name for field in fields(SamplingConfig)} _RUNTIME_FIELD_NAMES = {field.name for field in fields(RequestRuntimeConfig)} @@ -224,8 +228,10 @@ def generator_config_to_fastvideo_args(config: GeneratorConfig | Mapping[str, An def normalize_generation_request(request: GenerationRequest | Mapping[str, Any], ) -> GenerationRequest: normalized = (request if isinstance(request, GenerationRequest) else parse_config(GenerationRequest, request)) - if not hasattr(normalized, _EXPLICIT_REQUEST_ATTR): - setattr(normalized, _EXPLICIT_REQUEST_ATTR, _serialize_generation_request(normalized)) + if hasattr(normalized, EXPLICIT_REQUEST_ATTR): + 
refresh_generation_request_raw(normalized) + else: + bind_generation_request_raw(normalized, _serialize_generation_request(normalized)) return normalized @@ -253,7 +259,7 @@ def legacy_generate_call_to_request( raw.setdefault("inputs", {})["grid_sizes"] = grid_sizes normalized = parse_config(GenerationRequest, raw) - setattr(normalized, _EXPLICIT_REQUEST_ATTR, deepcopy(raw)) + bind_generation_request_raw(normalized, raw) return normalized @@ -355,7 +361,7 @@ def request_to_pipeline_overrides(request: GenerationRequest) -> dict[str, Any]: def _explicit_request_updates(request: GenerationRequest) -> dict[str, Any]: - raw = getattr(request, _EXPLICIT_REQUEST_ATTR, None) + raw = getattr(request, EXPLICIT_REQUEST_ATTR, None) if raw is None: raw = _serialize_generation_request(request) @@ -424,7 +430,7 @@ def _fan_out_explicit_request_metadata( index: int, prompt: str, ) -> None: - raw = getattr(source_request, _EXPLICIT_REQUEST_ATTR, None) + raw = getattr(source_request, EXPLICIT_REQUEST_ATTR, None) if raw is None: return @@ -438,7 +444,7 @@ def _fan_out_explicit_request_metadata( _validate_batched_input_length(source_request.prompt, value, field_name) inputs[field_name] = deepcopy(value[index]) - setattr(target_request, _EXPLICIT_REQUEST_ATTR, raw) + setattr(target_request, EXPLICIT_REQUEST_ATTR, raw) def _validate_batched_input_length( diff --git a/fastvideo/api/overrides.py b/fastvideo/api/overrides.py index 8691d4f56b..3eaca58321 100644 --- a/fastvideo/api/overrides.py +++ b/fastvideo/api/overrides.py @@ -31,7 +31,7 @@ def parse_cli_overrides(overrides: list[str]) -> dict[str, Any]: raise ValueError(f"Missing value for override {token!r}") raw_value = overrides[index] - parsed[key] = _cast_override_value(raw_value) + parsed[_normalize_override_key(key)] = _cast_override_value(raw_value) index += 1 return parsed @@ -94,4 +94,8 @@ def _cast_override_value(raw: str) -> Any: return raw +def _normalize_override_key(key: str) -> str: + return key.replace("-", "_") + + 
__all__ = ["apply_overrides", "parse_cli_overrides"] diff --git a/fastvideo/api/parser.py b/fastvideo/api/parser.py index b96a244151..8da9946655 100644 --- a/fastvideo/api/parser.py +++ b/fastvideo/api/parser.py @@ -12,7 +12,12 @@ from fastvideo.api.errors import ConfigValidationError from fastvideo.api.overrides import apply_overrides, parse_cli_overrides -from fastvideo.api.schema import RunConfig, ServeConfig +from fastvideo.api.request_metadata import ( + bind_generation_request_raw, + bind_run_config_raw, + bind_serve_config_raw, +) +from fastvideo.api.schema import GenerationRequest, RunConfig, ServeConfig T = TypeVar("T") _UNION_ORIGINS = {types.UnionType, Union} @@ -31,7 +36,14 @@ def parse_config(config_type: type[T], raw: Mapping[str, Any] | T) -> T: return raw if not isinstance(raw, Mapping): raise ConfigValidationError("", f"expected mapping for {config_type.__name__}") - return _SchemaParser().parse_dataclass(config_type, raw, "") + parsed = _SchemaParser().parse_dataclass(config_type, raw, "") + if config_type is GenerationRequest: + return bind_generation_request_raw(parsed, raw) + if config_type is RunConfig: + return bind_run_config_raw(parsed, raw) + if config_type is ServeConfig: + return bind_serve_config_raw(parsed, raw) + return parsed def config_to_dict(config: Any) -> Any: diff --git a/fastvideo/api/request_metadata.py b/fastvideo/api/request_metadata.py new file mode 100644 index 0000000000..eb241bf6c1 --- /dev/null +++ b/fastvideo/api/request_metadata.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Track which GenerationRequest fields the user explicitly provided. + +This module solves a specific problem: when translating a GenerationRequest into +a legacy SamplingParam, we need to distinguish user-provided values (which +should override model defaults) from schema defaults (which should NOT override +model defaults). + +The approach: +1. At bind time, store the original raw dict and a baseline snapshot. +2. 
Patch __setattr__ on tracked dataclass types to record dirty field paths. +3. At access time, do a lazy 3-way merge: raw + baseline + current state, + with dirty paths forcing inclusion even when current == baseline. +""" +from __future__ import annotations + +from collections.abc import Mapping +from copy import deepcopy +import dataclasses +from typing import Any, cast +from collections.abc import Callable + +from fastvideo.api.schema import ( + ContinuationState, + GenerationPlan, + GenerationRequest, + InputConfig, + OutputConfig, + PlannedStage, + RequestRuntimeConfig, + RunConfig, + SamplingConfig, + ServeConfig, +) + +EXPLICIT_REQUEST_ATTR = "_fastvideo_explicit_request" +ORIGINAL_REQUEST_STATE_ATTR = "_fastvideo_original_request_state" +_TRACKING_ROOT_ATTR = "_fastvideo_request_tracking_root" +_TRACKING_PATH_ATTR = "_fastvideo_request_tracking_path" +_TRACKING_PATCHED_ATTR = "_fastvideo_request_tracking_patched" +_DIRTY_PATHS_ATTR = "_fastvideo_dirty_paths" +_TRACKED_REQUEST_TYPES = ( + GenerationRequest, + InputConfig, + SamplingConfig, + RequestRuntimeConfig, + OutputConfig, + ContinuationState, + PlannedStage, + GenerationPlan, +) + + +def bind_generation_request_raw( + request: GenerationRequest, + raw: Mapping[str, Any] | None, +) -> GenerationRequest: + _ensure_request_tracking() + # Disable dirty tracking during bind so tree walk doesn't record paths. + object.__setattr__(request, _DIRTY_PATHS_ATTR, None) + object.__setattr__(request, EXPLICIT_REQUEST_ATTR, deepcopy(dict(raw or {}))) + object.__setattr__(request, ORIGINAL_REQUEST_STATE_ATTR, _serialize_config(request)) + _set_tracking_roots(request, request, "") + # Enable dirty tracking. 
+ object.__setattr__(request, _DIRTY_PATHS_ATTR, set()) + return request + + +def bind_run_config_raw( + config: RunConfig, + raw: Mapping[str, Any], +) -> RunConfig: + request_raw = raw.get("request") + if isinstance(request_raw, Mapping): + bind_generation_request_raw(config.request, request_raw) + return config + + +def bind_serve_config_raw( + config: ServeConfig, + raw: Mapping[str, Any], +) -> ServeConfig: + default_request_raw = raw.get("default_request") + if isinstance(default_request_raw, Mapping): + bind_generation_request_raw(config.default_request, default_request_raw) + elif "default_request" not in raw: + bind_generation_request_raw(config.default_request, {}) + return config + + +def refresh_generation_request_raw(request: GenerationRequest, ) -> dict[str, Any] | None: + raw = getattr(request, EXPLICIT_REQUEST_ATTR, None) + baseline = getattr(request, ORIGINAL_REQUEST_STATE_ATTR, None) + if not isinstance(raw, Mapping) or not isinstance(baseline, Mapping): + return None + + dirty = getattr(request, _DIRTY_PATHS_ATTR, None) or frozenset() + current = _serialize_config(request) + merged = deepcopy(dict(raw)) + _merge_request_mutations(merged, dict(baseline), current, dirty) + + object.__setattr__(request, EXPLICIT_REQUEST_ATTR, merged) + object.__setattr__(request, ORIGINAL_REQUEST_STATE_ATTR, current) + object.__setattr__(request, _DIRTY_PATHS_ATTR, set()) + return merged + + +# --------------------------------------------------------------------------- +# 3-way merge: raw + baseline + current, with dirty-path forcing +# --------------------------------------------------------------------------- + +_MISSING = object() + + +def _merge_request_mutations( + merged: dict[str, Any], + baseline: Mapping[str, Any], + current: Mapping[str, Any], + dirty: frozenset[str] | set[str], + path_prefix: str = "", + force_dirty: bool = False, +) -> None: + # Remove keys that were deleted from the current state. 
+ for key in set(merged) | set(baseline): + if key not in current: + merged.pop(key, None) + + for key in current: + current_path = f"{path_prefix}.{key}" if path_prefix else key + current_value = current[key] + baseline_value = baseline.get(key, _MISSING) + merged_value = merged.get(key, _MISSING) + + # If this exact path was dirtied (e.g. whole section replaced), + # propagate to all children. + child_force = force_dirty or current_path in dirty + + # Recurse into nested mappings. + if isinstance(current_value, Mapping) and isinstance(baseline_value, Mapping): + nested = (deepcopy(dict(merged_value)) if isinstance(merged_value, Mapping) else {}) + _merge_request_mutations( + nested, + baseline_value, + current_value, + dirty, + current_path, + child_force, + ) + if nested: + merged[key] = nested + else: + merged.pop(key, None) + continue + + # A field is explicitly set if: + # - it's new (not in baseline), + # - it changed from baseline, + # - its path was touched by __setattr__ (dirty), or + # - an ancestor path was dirty (whole section replaced). 
+ is_dirty = child_force or current_path in dirty + if baseline_value is _MISSING or current_value != baseline_value or is_dirty: + merged[key] = deepcopy(current_value) + + +# --------------------------------------------------------------------------- +# __setattr__ patching for dirty-path recording +# --------------------------------------------------------------------------- + + +def _ensure_request_tracking() -> None: + for config_type in _TRACKED_REQUEST_TYPES: + _patch_tracking_setattr(config_type) + + +def _patch_tracking_setattr(config_type: type[Any]) -> None: + if getattr(config_type, _TRACKING_PATCHED_ATTR, False): + return + + original_setattr = cast( + Callable[[Any, str, Any], None], + config_type.__setattr__, + ) + field_names = {field.name for field in dataclasses.fields(config_type)} + + def _tracking_setattr(self: Any, name: str, value: Any) -> None: + if name.startswith("_fastvideo_") or name not in field_names: + original_setattr(self, name, value) + return + + root = getattr(self, _TRACKING_ROOT_ATTR, None) + if root is not None: + dirty = getattr(root, _DIRTY_PATHS_ATTR, None) + if isinstance(dirty, set): + prefix = getattr(self, _TRACKING_PATH_ATTR, "") + path = f"{prefix}.{name}" if prefix else name + dirty.add(path) + + original_setattr(self, name, value) + + type.__setattr__(config_type, "__setattr__", _tracking_setattr) + setattr(config_type, _TRACKING_PATCHED_ATTR, True) + + +# --------------------------------------------------------------------------- +# Tree walk to set tracking root/path on nested dataclasses +# --------------------------------------------------------------------------- + + +def _set_tracking_roots( + root: GenerationRequest, + obj: Any, + prefix: str, +) -> None: + if not dataclasses.is_dataclass(obj) or isinstance(obj, type): + return + object.__setattr__(obj, _TRACKING_ROOT_ATTR, root) + object.__setattr__(obj, _TRACKING_PATH_ATTR, prefix) + for field in dataclasses.fields(obj): + child = getattr(obj, field.name) + 
child_path = f"{prefix}.{field.name}" if prefix else field.name + if dataclasses.is_dataclass(child) and not isinstance(child, type): + _set_tracking_roots(root, child, child_path) + + +# --------------------------------------------------------------------------- +# Serialization helper +# --------------------------------------------------------------------------- + + +def _serialize_config(config: Any) -> Any: + if dataclasses.is_dataclass(config) and not isinstance(config, type): + return {field.name: _serialize_config(getattr(config, field.name)) for field in dataclasses.fields(config)} + if isinstance(config, list): + return [_serialize_config(item) for item in config] + if isinstance(config, dict): + return {key: _serialize_config(value) for key, value in config.items()} + return deepcopy(config) + + +__all__ = [ + "EXPLICIT_REQUEST_ATTR", + "ORIGINAL_REQUEST_STATE_ATTR", + "bind_generation_request_raw", + "bind_run_config_raw", + "bind_serve_config_raw", + "refresh_generation_request_raw", +] diff --git a/fastvideo/entrypoints/cli/bench_serving.py b/fastvideo/entrypoints/cli/bench_serving.py index 4e57beca61..28f6d88db9 100644 --- a/fastvideo/entrypoints/cli/bench_serving.py +++ b/fastvideo/entrypoints/cli/bench_serving.py @@ -7,7 +7,7 @@ # launch a server and benchmark on it # T2V or T2I or any other multimodal generation model - fastvideo serve --model-path Wan-AI/Wan2.1-T2V-1.3B-Diffusers --port 8000 + fastvideo serve --config serve.yaml # benchmark it and make sure the port is the same as the server's port fastvideo bench --dataset vbench --num-prompts 20 --port 8000 diff --git a/fastvideo/entrypoints/cli/generate.py b/fastvideo/entrypoints/cli/generate.py index fc2fe0a0a2..5adc3d0343 100644 --- a/fastvideo/entrypoints/cli/generate.py +++ b/fastvideo/entrypoints/cli/generate.py @@ -2,19 +2,17 @@ # adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/cli/serve.py import argparse -import dataclasses import os from typing 
import cast from fastvideo import VideoGenerator -from fastvideo.configs.sample.base import SamplingParam from fastvideo.entrypoints.cli.cli_types import CLISubcommand -from fastvideo.entrypoints.cli.utils import RaiseNotImplementedAction -from fastvideo.fastvideo_args import FastVideoArgs +from fastvideo.entrypoints.cli.inference_config import build_generate_run_config from fastvideo.logger import init_logger from fastvideo.utils import FlexibleArgumentParser logger = init_logger(__name__) +_VALIDATED_RUN_CONFIG_ATTR = "_fastvideo_validated_run_config" class GenerateSubcommand(CLISubcommand): @@ -23,89 +21,47 @@ class GenerateSubcommand(CLISubcommand): def __init__(self) -> None: self.name = "generate" super().__init__() - self.init_arg_names = self._get_init_arg_names() - self.generation_arg_names = self._get_generation_arg_names() - - def _get_init_arg_names(self) -> list[str]: - """Get names of arguments for VideoGenerator initialization""" - return ["num_gpus", "tp_size", "sp_size", "model_path"] - - def _get_generation_arg_names(self) -> list[str]: - """Get names of arguments for generate_video method""" - return [field.name for field in dataclasses.fields(SamplingParam)] def cmd(self, args: argparse.Namespace) -> None: - excluded_args = ['subparser', 'config', 'dispatch_function'] - - provided_args = {} - for k, v in vars(args).items(): - if (k not in excluded_args and v is not None and hasattr(args, '_provided') and k in args._provided): - provided_args[k] = v - - if 'model_path' in vars(args) and args.model_path is not None: - provided_args['model_path'] = args.model_path - - if 'prompt' in vars(args) and args.prompt is not None: - provided_args['prompt'] = args.prompt - - merged_args = {**provided_args} - - logger.info('CLI Args: %s', merged_args) - - if 'model_path' not in merged_args or not merged_args['model_path']: - raise ValueError("model_path must be provided either in config file or via --model-path") - - # Check if either prompt or prompt_txt is 
provided - has_prompt = 'prompt' in merged_args and merged_args['prompt'] - has_prompt_txt = 'prompt_txt' in merged_args and merged_args['prompt_txt'] - - if not (has_prompt or has_prompt_txt): - raise ValueError("Either prompt or prompt_txt must be provided") + run_config = getattr(args, _VALIDATED_RUN_CONFIG_ATTR, None) + if run_config is None: + run_config = build_generate_run_config( + args, + overrides=getattr(args, "_unknown", None), + ) + logger.info("CLI generate config: %s", run_config) - if has_prompt and has_prompt_txt: - raise ValueError("Cannot provide both 'prompt' and 'prompt_txt'. Use only one of them.") - - init_args = {k: v for k, v in merged_args.items() if k not in self.generation_arg_names} - generation_args = {k: v for k, v in merged_args.items() if k in self.generation_arg_names} - generation_args.setdefault("return_frames", False) - - model_path = init_args.pop('model_path') - prompt = generation_args.pop('prompt', None) - - generator = VideoGenerator.from_pretrained(model_path=model_path, **init_args) - - # Call generate_video - it handles both single and batch modes - generator.generate_video(prompt=prompt, **generation_args) + generator = VideoGenerator.from_config(run_config.generator) + generator.generate(run_config.request) def validate(self, args: argparse.Namespace) -> None: """Validate the arguments for this command""" - if args.num_gpus is not None and args.num_gpus <= 0: - raise ValueError("Number of gpus must be positive") - - if args.config and not os.path.exists(args.config): + if not args.config: + raise ValueError("fastvideo generate requires --config PATH; use a nested " + "run config plus optional dotted overrides") + if not os.path.exists(args.config): raise ValueError(f"Config file not found: {args.config}") + setattr( + args, + _VALIDATED_RUN_CONFIG_ATTR, + build_generate_run_config( + args, + overrides=getattr(args, "_unknown", None), + ), + ) def subparser_init(self, subparsers: argparse._SubParsersAction) -> 
FlexibleArgumentParser: generate_parser = subparsers.add_parser( "generate", help="Run inference on a model", - usage="fastvideo generate (--model-path MODEL_PATH_OR_ID --prompt PROMPT) | --config CONFIG_FILE [OPTIONS]") + usage="fastvideo generate --config RUN_CONFIG [--dotted.override VALUE]") generate_parser.add_argument( "--config", type=str, default='', required=False, - help="Read CLI options from a config JSON or YAML file. If provided, --model-path and --prompt are optional." - ) - - generate_parser = FastVideoArgs.add_cli_args(generate_parser) - generate_parser = SamplingParam.add_cli_args(generate_parser) - - generate_parser.add_argument( - "--text-encoder-configs", - action=RaiseNotImplementedAction, - help="JSON array of text encoder configurations (NOT YET IMPLEMENTED)", + help="Path to a nested run config JSON or YAML file. Required.", ) return cast(FlexibleArgumentParser, generate_parser) diff --git a/fastvideo/entrypoints/cli/inference_config.py b/fastvideo/entrypoints/cli/inference_config.py new file mode 100644 index 0000000000..8982ab013c --- /dev/null +++ b/fastvideo/entrypoints/cli/inference_config.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import argparse +from collections.abc import Mapping +from copy import deepcopy +from typing import Any + +from fastvideo.api.overrides import apply_overrides, parse_cli_overrides +from fastvideo.api.parser import load_raw_config, parse_config +from fastvideo.api.schema import RunConfig, ServeConfig + +_GENERATE_OVERRIDE_PREFIXES = ("generator.", "request.") +_SERVE_OVERRIDE_PREFIXES = ( + "generator.", + "server.", + "default_request.", +) + + +def build_generate_run_config( + args: argparse.Namespace, + overrides: list[str] | None = None, +) -> RunConfig: + raw = _load_nested_config(getattr(args, "config", None)) + raw.setdefault("request", {}) + raw = _apply_dotted_overrides( + raw, + overrides, + allowed_prefixes=_GENERATE_OVERRIDE_PREFIXES, + ) + 
_ensure_generate_cli_defaults(raw) + config = parse_config(RunConfig, raw) + _validate_num_gpus(config.generator.engine.num_gpus) + _validate_generate_prompt_sources(config) + return config + + +def build_serve_config( + args: argparse.Namespace, + overrides: list[str] | None = None, +) -> ServeConfig: + raw = _load_nested_config(getattr(args, "config", None)) + raw.setdefault("server", {}) + raw.setdefault("default_request", {}) + raw = _apply_dotted_overrides( + raw, + overrides, + allowed_prefixes=_SERVE_OVERRIDE_PREFIXES, + ) + config = parse_config(ServeConfig, raw) + _validate_num_gpus(config.generator.engine.num_gpus) + return config + + +def _load_nested_config(path: str | None) -> dict[str, Any]: + if not path: + raise ValueError("Inference CLI requires --config PATH; use a nested config file " + "plus optional dotted overrides") + + raw = load_raw_config(path) + if not isinstance(raw.get("generator"), Mapping): + raise ValueError("Inference config must use the nested schema with a top-level " + "'generator' mapping") + return deepcopy(dict(raw)) + + +def _apply_dotted_overrides( + raw: Mapping[str, Any], + overrides: list[str] | None, + *, + allowed_prefixes: tuple[str, ...], +) -> dict[str, Any]: + if not overrides: + return deepcopy(dict(raw)) + + parsed = parse_cli_overrides(overrides) + for key in parsed: + if "." not in key: + raise ValueError("CLI overrides must use dotted config paths like " + "--request.sampling.seed 42") + if not key.startswith(allowed_prefixes): + allowed = ", ".join(allowed_prefixes) + raise ValueError(f"Unsupported override path {key!r}. 
Allowed prefixes: {allowed}") + return apply_overrides(raw, parsed) + + +def _ensure_generate_cli_defaults(raw: dict[str, Any]) -> None: + request = raw.setdefault("request", {}) + output = request.setdefault("output", {}) + output.setdefault("return_frames", False) + + +def _validate_generate_prompt_sources(config: RunConfig) -> None: + has_prompt = config.request.prompt is not None + has_prompt_path = config.request.inputs.prompt_path is not None + if not (has_prompt or has_prompt_path): + raise ValueError("Either request.prompt or request.inputs.prompt_path must be provided") + if has_prompt and has_prompt_path: + raise ValueError("Cannot provide both request.prompt and request.inputs.prompt_path") + + +def _validate_num_gpus(num_gpus: int) -> None: + if num_gpus <= 0: + raise ValueError(f"generator.engine.num_gpus must be > 0; got {num_gpus}") + + +__all__ = [ + "build_generate_run_config", + "build_serve_config", +] diff --git a/fastvideo/entrypoints/cli/main.py b/fastvideo/entrypoints/cli/main.py index bd8f29d4e2..3d753089d4 100644 --- a/fastvideo/entrypoints/cli/main.py +++ b/fastvideo/entrypoints/cli/main.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/cli/main.py - from fastvideo.entrypoints.cli.cli_types import CLISubcommand from fastvideo.entrypoints.cli.generate import cmd_init as generate_cmd_init from fastvideo.utils import FlexibleArgumentParser @@ -27,14 +26,17 @@ def main() -> None: for cmd in cmd_init(): cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd) cmds[cmd.name] = cmd - args = parser.parse_args() + + args, unknown = parser.parse_known_args() + if unknown and args.subparser not in {"generate", "serve"}: + parser.error(f"unrecognized arguments: {' '.join(unknown)}") + args._unknown = unknown if args.subparser in cmds: cmds[args.subparser].validate(args) - - if hasattr(args, "dispatch_function"): args.dispatch_function(args) - 
else: - parser.print_help() + return + + parser.print_help() if __name__ == "__main__": diff --git a/fastvideo/entrypoints/cli/serve.py b/fastvideo/entrypoints/cli/serve.py index fdee6a7795..0e52fa572b 100644 --- a/fastvideo/entrypoints/cli/serve.py +++ b/fastvideo/entrypoints/cli/serve.py @@ -2,14 +2,18 @@ # adapted from vllm: https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/entrypoints/cli/serve.py import argparse +import os from typing import cast +from fastvideo.api.compat import generator_config_to_fastvideo_args +from fastvideo.api.request_metadata import EXPLICIT_REQUEST_ATTR from fastvideo.entrypoints.cli.cli_types import CLISubcommand -from fastvideo.fastvideo_args import FastVideoArgs +from fastvideo.entrypoints.cli.inference_config import build_serve_config from fastvideo.logger import init_logger from fastvideo.utils import FlexibleArgumentParser logger = init_logger(__name__) +_VALIDATED_SERVE_CONFIG_ATTR = "_fastvideo_validated_serve_config" class ServeSubcommand(CLISubcommand): @@ -20,94 +24,67 @@ def __init__(self) -> None: super().__init__() def cmd(self, args: argparse.Namespace) -> None: - excluded_args = { - "subparser", - "config", - "dispatch_function", - "host", - "port", - "output_dir", - } - - provided: set[str] = getattr(args, '_provided', set()) - cli_kwargs = {} - for k, v in vars(args).items(): - if k in excluded_args: - continue - if k == '_provided': - continue - if k in provided and v is not None: - cli_kwargs[k] = v - - if 'model_path' not in cli_kwargs and args.model_path is not None: - cli_kwargs['model_path'] = args.model_path - - if not cli_kwargs.get('model_path'): - raise ValueError("model_path must be provided via --model-path") - - from fastvideo.entrypoints.openai.api_server import ( - DEFAULT_HOST, - DEFAULT_OUTPUT_DIR, - DEFAULT_PORT, - run_server, + serve_config = getattr(args, _VALIDATED_SERVE_CONFIG_ATTR, None) + if serve_config is None: + serve_config = build_serve_config( + args, + overrides=getattr(args, 
"_unknown", None), + ) + explicit_raw = getattr( + serve_config.default_request, + EXPLICIT_REQUEST_ATTR, + None, ) + if explicit_raw: + raise NotImplementedError("ServeConfig.default_request is not wired into the OpenAI " + "server yet") - host = getattr(args, "host", DEFAULT_HOST) - port = getattr(args, "port", DEFAULT_PORT) - output_dir = getattr(args, "output_dir", DEFAULT_OUTPUT_DIR) + from fastvideo.entrypoints.openai.api_server import ( + run_server, ) - logger.info("CLI serve args: %s", cli_kwargs) - logger.info("Server will listen on %s:%d", host, port) + logger.info("CLI serve config: %s", serve_config) + logger.info( + "Server will listen on %s:%d", + serve_config.server.host, + serve_config.server.port, + ) - fastvideo_args = FastVideoArgs.from_kwargs(**cli_kwargs) - run_server(fastvideo_args, host=host, port=port, output_dir=output_dir) + fastvideo_args = generator_config_to_fastvideo_args(serve_config.generator) + run_server( + fastvideo_args, + host=serve_config.server.host, + port=serve_config.server.port, + output_dir=serve_config.server.output_dir, + ) def validate(self, args: argparse.Namespace) -> None: - if args.num_gpus is not None and args.num_gpus <= 0: - raise ValueError("Number of gpus must be positive") - - def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: - from fastvideo.entrypoints.openai.api_server import ( - DEFAULT_HOST, - DEFAULT_OUTPUT_DIR, - DEFAULT_PORT, + if not args.config: + raise ValueError("fastvideo serve requires --config PATH; use a nested " + "serve config plus optional dotted overrides") + if not os.path.exists(args.config): + raise ValueError(f"Config file not found: {args.config}") + setattr( + args, + _VALIDATED_SERVE_CONFIG_ATTR, + build_serve_config( + args, + overrides=getattr(args, "_unknown", None), + ), ) + def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: serve_parser = subparsers.add_parser( "serve", help="Start an 
OpenAI-compatible HTTP server", - usage=("fastvideo serve --model-path MODEL_PATH_OR_ID " - "[--host HOST] [--port PORT] [OPTIONS]"), - ) - - serve_parser.add_argument( - "--host", - type=str, - default=DEFAULT_HOST, - help=f"Host to bind the server to (default: {DEFAULT_HOST})", - ) - serve_parser.add_argument( - "--port", - type=int, - default=DEFAULT_PORT, - help=f"Port to listen on (default: {DEFAULT_PORT})", - ) - serve_parser.add_argument( - "--output-dir", - type=str, - default=DEFAULT_OUTPUT_DIR, - help=("Directory for generated outputs " - f"(default: {DEFAULT_OUTPUT_DIR})"), + usage="fastvideo serve --config SERVE_CONFIG [--dotted.override VALUE]", ) serve_parser.add_argument( "--config", type=str, default="", required=False, - help="Read CLI options from a config JSON or YAML file.", + help="Path to a nested config JSON or YAML file. Required.", ) - - serve_parser = FastVideoArgs.add_cli_args(serve_parser) return cast(FlexibleArgumentParser, serve_parser) diff --git a/fastvideo/tests/api/test_cli_translation.py b/fastvideo/tests/api/test_cli_translation.py new file mode 100644 index 0000000000..1aec790972 --- /dev/null +++ b/fastvideo/tests/api/test_cli_translation.py @@ -0,0 +1,502 @@ +from __future__ import annotations + +import sys +from types import SimpleNamespace + +import pytest + +from fastvideo.api.compat import request_to_sampling_param +from fastvideo.entrypoints.cli import main as cli_main +from fastvideo.entrypoints.cli.generate import GenerateSubcommand +from fastvideo.entrypoints.cli.inference_config import ( + build_generate_run_config, + build_serve_config, +) +from fastvideo.configs.sample import SamplingParam +from fastvideo.entrypoints.cli.serve import ServeSubcommand +from fastvideo.entrypoints.openai import api_server +from fastvideo.entrypoints.video_generator import VideoGenerator +from fastvideo.utils import FlexibleArgumentParser + + +def _parse_generate_args(argv: list[str]): + parser = FlexibleArgumentParser() + subparsers = 
parser.add_subparsers(dest="subparser") + GenerateSubcommand().subparser_init(subparsers) + args, unknown = parser.parse_known_args(["generate", *argv]) + args._unknown = unknown + return args, unknown + + +def _parse_serve_args(argv: list[str]): + parser = FlexibleArgumentParser() + subparsers = parser.add_subparsers(dest="subparser") + ServeSubcommand().subparser_init(subparsers) + args, unknown = parser.parse_known_args(["serve", *argv]) + args._unknown = unknown + return args, unknown + + +def test_generate_parser_preserves_unknown_dotted_overrides(tmp_path): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello\n", + encoding="utf-8", + ) + + args, unknown = _parse_generate_args([ + "--config", + str(config_path), + "--request.sampling.seed", + "42", + ]) + + assert args.config == str(config_path) + assert unknown == ["--request.sampling.seed", "42"] + + +def test_build_generate_run_config_loads_nested_config_and_dotted_overrides( + tmp_path, +): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + " engine:\n" + " num_gpus: 1\n" + "request:\n" + " prompt: hello\n" + " output:\n" + " return_frames: true\n", + encoding="utf-8", + ) + + args, unknown = _parse_generate_args([ + "--config", + str(config_path), + "--generator.engine.num_gpus", + "2", + "--request.sampling.seed", + "7", + ]) + + config = build_generate_run_config(args, unknown) + + assert config.generator.model_path == "test-model" + assert config.generator.engine.num_gpus == 2 + assert config.request.prompt == "hello" + assert config.request.sampling.seed == 7 + assert config.request.output.return_frames is True + + +def test_build_generate_run_config_accepts_dashed_dotted_overrides(tmp_path): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello\n", + 
encoding="utf-8", + ) + + args, unknown = _parse_generate_args([ + "--config", + str(config_path), + "--generator.engine.num-gpus", + "2", + "--request.output.output-path", + "outputs/dashed", + ]) + + config = build_generate_run_config(args, unknown) + + assert config.generator.engine.num_gpus == 2 + assert config.request.output.output_path == "outputs/dashed" + + +def test_build_generate_run_config_loads_nested_json_config(tmp_path): + config_path = tmp_path / "run.json" + config_path.write_text( + '{"generator":{"model_path":"json-model"},' + '"request":{"prompt":"hello"}}', + encoding="utf-8", + ) + + args, unknown = _parse_generate_args(["--config", str(config_path)]) + config = build_generate_run_config(args, unknown) + + assert config.generator.model_path == "json-model" + assert config.request.prompt == "hello" + assert config.request.output.return_frames is False + + +def test_build_generate_run_config_preserves_model_defaults_for_omitted_request_fields( + tmp_path, + monkeypatch, +): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello\n", + encoding="utf-8", + ) + + def fake_from_pretrained(cls, model_path): + return cls( + num_frames=81, + height=480, + width=832, + fps=16, + guidance_scale=3.0, + negative_prompt="model default", + ) + + monkeypatch.setattr( + SamplingParam, + "from_pretrained", + classmethod(fake_from_pretrained), + ) + + args, unknown = _parse_generate_args(["--config", str(config_path)]) + config = build_generate_run_config(args, unknown) + sampling_param = request_to_sampling_param( + config.request, + model_path=config.generator.model_path, + ) + + assert sampling_param.num_frames == 81 + assert sampling_param.height == 480 + assert sampling_param.width == 832 + assert sampling_param.fps == 16 + assert sampling_param.guidance_scale == 3.0 + assert sampling_param.negative_prompt == "model default" + + +def 
test_build_generate_run_config_rejects_flat_config(tmp_path): + config_path = tmp_path / "run-flat.yaml" + config_path.write_text( + "model_path: flat-model\n" + "prompt: hello\n", + encoding="utf-8", + ) + + args, unknown = _parse_generate_args(["--config", str(config_path)]) + with pytest.raises( + ValueError, + match="top-level 'generator' mapping", + ): + build_generate_run_config(args, unknown) + + +def test_build_generate_run_config_rejects_non_dotted_overrides(tmp_path): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello\n", + encoding="utf-8", + ) + + args, unknown = _parse_generate_args([ + "--config", + str(config_path), + "--num-gpus", + "2", + ]) + with pytest.raises( + ValueError, + match="CLI overrides must use dotted config paths", + ): + build_generate_run_config(args, unknown) + + +def test_build_generate_run_config_requires_single_prompt_source(tmp_path): + missing_prompt_path = tmp_path / "missing.yaml" + missing_prompt_path.write_text( + "generator:\n" + " model_path: test-model\n", + encoding="utf-8", + ) + args, unknown = _parse_generate_args(["--config", str(missing_prompt_path)]) + with pytest.raises( + ValueError, + match="Either request.prompt or request.inputs.prompt_path must be provided", + ): + build_generate_run_config(args, unknown) + + conflicting_prompt_path = tmp_path / "conflict.yaml" + conflicting_prompt_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello\n" + " inputs:\n" + " prompt_path: prompts.txt\n", + encoding="utf-8", + ) + args, unknown = _parse_generate_args(["--config", str(conflicting_prompt_path)]) + with pytest.raises( + ValueError, + match="Cannot provide both request.prompt and request.inputs.prompt_path", + ): + build_generate_run_config(args, unknown) + + +def test_build_serve_config_loads_nested_config_and_dotted_overrides(tmp_path): + config_path = tmp_path / "serve.yaml" + 
config_path.write_text( + "generator:\n" + " model_path: serve-model\n" + "server:\n" + " host: 0.0.0.0\n" + " port: 8000\n", + encoding="utf-8", + ) + + args, unknown = _parse_serve_args([ + "--config", + str(config_path), + "--generator.engine.num_gpus", + "3", + "--server.port", + "9100", + ]) + + config = build_serve_config(args, unknown) + + assert config.generator.model_path == "serve-model" + assert config.generator.engine.num_gpus == 3 + assert config.server.host == "0.0.0.0" + assert config.server.port == 9100 + + +def test_build_serve_config_rejects_flat_config(tmp_path): + config_path = tmp_path / "serve-flat.yaml" + config_path.write_text( + "model_path: serve-model\n" + "host: 127.0.0.1\n", + encoding="utf-8", + ) + + args, unknown = _parse_serve_args(["--config", str(config_path)]) + with pytest.raises( + ValueError, + match="top-level 'generator' mapping", + ): + build_serve_config(args, unknown) + + +def test_build_serve_config_rejects_non_dotted_overrides(tmp_path): + config_path = tmp_path / "serve.yaml" + config_path.write_text( + "generator:\n" + " model_path: serve-model\n", + encoding="utf-8", + ) + + args, unknown = _parse_serve_args([ + "--config", + str(config_path), + "--port", + "9000", + ]) + with pytest.raises( + ValueError, + match="CLI overrides must use dotted config paths", + ): + build_serve_config(args, unknown) + + +def test_generate_subcommand_requires_config(): + args, _ = _parse_generate_args([]) + + with pytest.raises( + ValueError, + match="fastvideo generate requires --config PATH", + ): + GenerateSubcommand().validate(args) + + +def test_serve_subcommand_requires_config(): + args, _ = _parse_serve_args([]) + + with pytest.raises( + ValueError, + match="fastvideo serve requires --config PATH", + ): + ServeSubcommand().validate(args) + + +def test_generate_subcommand_rejects_non_positive_num_gpus(tmp_path): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + 
"request:\n" + " prompt: hello world\n", + encoding="utf-8", + ) + args, _ = _parse_generate_args([ + "--config", + str(config_path), + "--generator.engine.num_gpus", + "0", + ]) + + with pytest.raises( + ValueError, + match=r"generator\.engine\.num_gpus must be > 0; got 0", + ): + GenerateSubcommand().validate(args) + + +def test_serve_subcommand_rejects_non_positive_num_gpus(tmp_path): + config_path = tmp_path / "serve.yaml" + config_path.write_text( + "generator:\n" + " model_path: serve-model\n", + encoding="utf-8", + ) + args, _ = _parse_serve_args([ + "--config", + str(config_path), + "--generator.engine.num_gpus", + "0", + ]) + + with pytest.raises( + ValueError, + match=r"generator\.engine\.num_gpus must be > 0; got 0", + ): + ServeSubcommand().validate(args) + + +def test_generate_subcommand_dispatches_via_typed_config(tmp_path, monkeypatch): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello world\n", + encoding="utf-8", + ) + args, _ = _parse_generate_args([ + "--config", + str(config_path), + "--request.sampling.num_frames", + "81", + ]) + captured: dict[str, object] = {} + + class FakeGenerator: + + def generate(self, request): + captured["request"] = request + return None + + def fake_from_config(cls, config): + captured["config"] = config + return FakeGenerator() + + monkeypatch.setattr( + VideoGenerator, + "from_config", + classmethod(fake_from_config), + ) + + GenerateSubcommand().cmd(args) + + request = captured["request"] + assert captured["config"].model_path == "test-model" + assert request.prompt == "hello world" + assert request.sampling.num_frames == 81 + assert request.output.return_frames is False + + +def test_serve_subcommand_dispatches_via_typed_config(tmp_path, monkeypatch): + config_path = tmp_path / "serve.yaml" + config_path.write_text( + "generator:\n" + " model_path: serve-model\n", + encoding="utf-8", + ) + args, _ = _parse_serve_args([ + 
"--config", + str(config_path), + "--server.host", + "127.0.0.1", + "--server.port", + "9000", + "--server.output_dir", + "serve-outputs/", + "--generator.engine.num_gpus", + "2", + ]) + captured: dict[str, object] = {} + + def fake_generator_config_to_fastvideo_args(config): + captured["config"] = config + return SimpleNamespace(model_path=config.model_path) + + def fake_run_server(fastvideo_args, host, port, output_dir): + captured["fastvideo_args"] = fastvideo_args + captured["host"] = host + captured["port"] = port + captured["output_dir"] = output_dir + + monkeypatch.setattr( + "fastvideo.entrypoints.cli.serve.generator_config_to_fastvideo_args", + fake_generator_config_to_fastvideo_args, + ) + monkeypatch.setattr(api_server, "run_server", fake_run_server) + + ServeSubcommand().cmd(args) + + assert captured["config"].model_path == "serve-model" + assert captured["config"].engine.num_gpus == 2 + assert captured["host"] == "127.0.0.1" + assert captured["port"] == 9000 + assert captured["output_dir"] == "serve-outputs/" + + +def test_serve_subcommand_rejects_non_default_default_request(tmp_path): + config_path = tmp_path / "serve-default-request.yaml" + config_path.write_text( + "generator:\n" + " model_path: serve-model\n" + "default_request:\n" + " prompt: hello\n", + encoding="utf-8", + ) + args, _ = _parse_serve_args(["--config", str(config_path)]) + + with pytest.raises( + NotImplementedError, + match="ServeConfig.default_request is not wired", + ): + ServeSubcommand().cmd(args) + + +def test_main_rejects_top_level_config_without_subcommand(tmp_path, monkeypatch): + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello\n", + encoding="utf-8", + ) + monkeypatch.setattr( + sys, + "argv", + ["fastvideo", "--config", str(config_path)], + ) + + with pytest.raises(SystemExit): + cli_main.main() diff --git a/fastvideo/tests/api/test_overrides.py 
b/fastvideo/tests/api/test_overrides.py index 824d4ff36e..ddffd504de 100644 --- a/fastvideo/tests/api/test_overrides.py +++ b/fastvideo/tests/api/test_overrides.py @@ -36,6 +36,20 @@ def test_parse_cli_overrides_casts_supported_scalar_and_collection_types() -> No } +def test_parse_cli_overrides_normalizes_dashed_dotted_keys() -> None: + parsed = parse_cli_overrides([ + "--generator.engine.num-gpus", + "2", + "--request.output.output-path", + "outputs/custom.mp4", + ]) + + assert parsed == { + "generator.engine.num_gpus": 2, + "request.output.output_path": "outputs/custom.mp4", + } + + def test_apply_overrides_merges_nested_dicts_without_mutating_source() -> None: original = { "generator": { diff --git a/fastvideo/tests/entrypoints/test_video_generator.py b/fastvideo/tests/entrypoints/test_video_generator.py index c0f79580b1..5f4fcf3c59 100644 --- a/fastvideo/tests/entrypoints/test_video_generator.py +++ b/fastvideo/tests/entrypoints/test_video_generator.py @@ -10,6 +10,7 @@ GeneratorConfig, InputConfig, SamplingConfig, + load_run_config, ) from fastvideo.configs.sample import SamplingParam from fastvideo.entrypoints.video_generator import VideoGenerator @@ -293,6 +294,139 @@ def fake_generate_video_impl(prompt=None, sampling_param=None, **kwargs): assert captured["sampling_param"].width == 1280 +def test_generate_mapping_request_preserves_model_defaults_for_omitted_fields( + monkeypatch, +): + generator = _new_runtime_video_generator() + captured = {} + + def fake_from_pretrained(cls, model_path): + return cls( + negative_prompt="model default", + num_frames=61, + height=448, + width=832, + fps=16, + guidance_scale=3.0, + ) + + def fake_generate_video_impl(prompt=None, sampling_param=None, **kwargs): + captured["sampling_param"] = sampling_param + return {"prompts": prompt, "video_path": "outputs/test.mp4"} + + monkeypatch.setattr(SamplingParam, "from_pretrained", classmethod(fake_from_pretrained)) + monkeypatch.setattr(generator, "_generate_video_impl", 
fake_generate_video_impl) + + generator.generate( + { + "prompt": "hello world", + } + ) + + assert captured["sampling_param"].negative_prompt == "model default" + assert captured["sampling_param"].num_frames == 61 + assert captured["sampling_param"].height == 448 + assert captured["sampling_param"].width == 832 + assert captured["sampling_param"].fps == 16 + assert captured["sampling_param"].guidance_scale == 3.0 + + +def test_generate_honors_post_load_request_mutations(monkeypatch, tmp_path): + generator = _new_runtime_video_generator() + captured = {} + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello world\n", + encoding="utf-8", + ) + + def fake_from_pretrained(cls, model_path): + return cls(seed=1024, num_frames=61, height=448, width=832) + + def fake_generate_video_impl(prompt=None, sampling_param=None, **kwargs): + captured["sampling_param"] = sampling_param + return {"prompts": prompt, "video_path": "outputs/test.mp4"} + + monkeypatch.setattr(SamplingParam, "from_pretrained", classmethod(fake_from_pretrained)) + monkeypatch.setattr(generator, "_generate_video_impl", fake_generate_video_impl) + + config = load_run_config(config_path) + config.request.sampling.seed = 7 + + generator.generate(config.request) + + assert captured["sampling_param"].seed == 7 + + +def test_generate_honors_post_load_mutations_matching_schema_defaults( + monkeypatch, + tmp_path, +): + generator = _new_runtime_video_generator() + captured = {} + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello world\n", + encoding="utf-8", + ) + + def fake_from_pretrained(cls, model_path): + return cls(guidance_scale=3.0) + + def fake_generate_video_impl(prompt=None, sampling_param=None, **kwargs): + captured["sampling_param"] = sampling_param + return {"prompts": prompt, "video_path": "outputs/test.mp4"} + + 
monkeypatch.setattr(SamplingParam, "from_pretrained", classmethod(fake_from_pretrained)) + monkeypatch.setattr(generator, "_generate_video_impl", fake_generate_video_impl) + + config = load_run_config(config_path) + config.request.sampling.guidance_scale = 1.0 + + generator.generate(config.request) + + assert captured["sampling_param"].guidance_scale == 1.0 + + +def test_generate_removes_deleted_loaded_stage_overrides(monkeypatch, tmp_path): + generator = _new_runtime_video_generator() + captured = {} + config_path = tmp_path / "run.yaml" + config_path.write_text( + "generator:\n" + " model_path: test-model\n" + "request:\n" + " prompt: hello world\n" + " stage_overrides:\n" + " refine:\n" + " t_thresh: 0.8\n", + encoding="utf-8", + ) + + def fake_from_pretrained(cls, model_path): + return cls(t_thresh=0.5) + + def fake_generate_video_impl(prompt=None, sampling_param=None, **kwargs): + captured["sampling_param"] = sampling_param + return {"prompts": prompt, "video_path": "outputs/test.mp4"} + + monkeypatch.setattr(SamplingParam, "from_pretrained", classmethod(fake_from_pretrained)) + monkeypatch.setattr(generator, "_generate_video_impl", fake_generate_video_impl) + + config = load_run_config(config_path) + del config.request.stage_overrides["refine"] + + generator.generate(config.request) + + assert captured["sampling_param"].t_thresh == 0.5 + + def test_generate_video_legacy_call_uses_legacy_impl(monkeypatch): generator = _new_runtime_video_generator() captured = {} diff --git a/fastvideo/tests/inference/bsa/test_bsa_inference.py b/fastvideo/tests/inference/bsa/test_bsa_inference.py index d684eec191..e52842d8ba 100644 --- a/fastvideo/tests/inference/bsa/test_bsa_inference.py +++ b/fastvideo/tests/inference/bsa/test_bsa_inference.py @@ -1,60 +1,98 @@ # SPDX-License-Identifier: Apache-2.0 +import json import os import subprocess import sys +import tempfile from pathlib import Path + def test_inference_bsa(): """Test FastVideo BSA_ATTN inference pipeline""" - 
num_gpus = "1" - model_base = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" output_dir = Path("outputs_video/bsa_1.3B/") os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "BSA_ATTN" - cmd = [ - sys.executable, - "-m", - "fastvideo.entrypoints.cli.main", - "generate", - "--model-path", model_base, - "--sp-size", num_gpus, - "--tp-size", "1", - "--num-gpus", num_gpus, - "--dit-cpu-offload", "False", - "--vae-cpu-offload", "False", - "--text-encoder-cpu-offload", "True", - "--pin-cpu-memory", "False", - "--height", "480", - "--width", "832", - "--num-frames", "77", - "--num-inference-steps", "10", - "--fps", "16", - "--guidance-scale", "6.0", - "--flow-shift", "8.0", - "--prompt", "A majestic lion strides across the golden savanna, its powerful frame glistening under the warm afternoon sun. The tall grass ripples gently in the breeze, enhancing the lion's commanding presence. The tone is vibrant, embodying the raw energy of the wild. Low angle, steady tracking shot, cinematic.", - "--negative-prompt", ( - "Bright tones, overexposed, static, blurred details, subtitles, style, " - "works, paintings, images, static, overall gray, worst quality, low quality, " - "JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, " - "poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, " - "still picture, messy background, three legs, many people in the background, walking backwards" - ), - "--seed", "1024", - "--output-path", str(output_dir), - ] - - subprocess.run(cmd, check=True) - - assert output_dir.exists(), f"Output directory {output_dir} does not exist" + config = { + "generator": { + "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "engine": { + "num_gpus": 1, + "parallelism": { + "tp_size": 1, + "sp_size": 1 + }, + "offload": { + "dit": False, + "vae": False, + "text_encoder": True, + "pin_cpu_memory": False, + }, + }, + "pipeline": { + "experimental": { + "flow_shift": 8.0, + }, + }, + }, + "request": { + "prompt": + "A majestic lion strides 
across the golden savanna, " + "its powerful frame glistening under the warm afternoon " + "sun. The tall grass ripples gently in the breeze, " + "enhancing the lion's commanding presence. The tone is " + "vibrant, embodying the raw energy of the wild. Low " + "angle, steady tracking shot, cinematic.", + "negative_prompt": + "Bright tones, overexposed, static, blurred details, " + "subtitles, style, works, paintings, images, static, " + "overall gray, worst quality, low quality, JPEG " + "compression residue, ugly, incomplete, extra fingers, " + "poorly drawn hands, poorly drawn faces, deformed, " + "disfigured, misshapen limbs, fused fingers, still " + "picture, messy background, three legs, many people in " + "the background, walking backwards", + "sampling": { + "seed": 1024, + "num_frames": 77, + "height": 480, + "width": 832, + "fps": 16, + "num_inference_steps": 10, + "guidance_scale": 6.0, + }, + "output": { + "output_path": str(output_dir), + }, + }, + } + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + cmd = [ + sys.executable, "-m", "fastvideo.entrypoints.cli.main", + "generate", "--config", config_path + ] + subprocess.run(cmd, check=True) + finally: + os.unlink(config_path) + + assert output_dir.exists(), \ + f"Output directory {output_dir} does not exist" video_files = list(output_dir.glob("*.mp4")) assert len(video_files) > 0, "No video files were generated" for video_file in video_files: - assert video_file.stat().st_size > 0, f"Video file {video_file} is empty" + assert video_file.stat().st_size > 0, \ + f"Video file {video_file} is empty" + if __name__ == "__main__": - test_inference_bsa() \ No newline at end of file + test_inference_bsa() diff --git a/fastvideo/tests/inference/vmoba/test_vmoba_inference.py b/fastvideo/tests/inference/vmoba/test_vmoba_inference.py index 0b0b80021f..694b76a4ba 100644 --- 
a/fastvideo/tests/inference/vmoba/test_vmoba_inference.py +++ b/fastvideo/tests/inference/vmoba/test_vmoba_inference.py @@ -1,58 +1,96 @@ # SPDX-License-Identifier: Apache-2.0 +import json import os import subprocess +import tempfile from pathlib import Path + def test_inference_vmoba(): """Test FastVideo VMOBA_ATTN inference pipeline""" - num_gpus = "1" - model_base = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers" output_dir = Path("outputs_video/vmoba_1.3B/") moba_config = "fastvideo/configs/backend/vmoba/wan_1.3B_77_480_832.json" os.environ["FASTVIDEO_ATTENTION_BACKEND"] = "VMOBA_ATTN" - cmd = [ - "fastvideo", "generate", - "--model-path", model_base, - "--sp-size", num_gpus, - "--tp-size", "1", - "--num-gpus", num_gpus, - "--dit-cpu-offload", "False", - "--vae-cpu-offload", "False", - "--text-encoder-cpu-offload", "True", - "--pin-cpu-memory", "False", - "--height", "480", - "--width", "832", - "--num-frames", "77", - "--num-inference-steps", "10", - "--moba-config-path", moba_config, - "--fps", "16", - "--guidance-scale", "6.0", - "--flow-shift", "8.0", - "--prompt", "A majestic lion strides across the golden savanna, its powerful frame glistening under the warm afternoon sun. The tall grass ripples gently in the breeze, enhancing the lion's commanding presence. The tone is vibrant, embodying the raw energy of the wild. 
Low angle, steady tracking shot, cinematic.", - "--negative-prompt", ( - "Bright tones, overexposed, static, blurred details, subtitles, style, " - "works, paintings, images, static, overall gray, worst quality, low quality, " - "JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, " - "poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, " - "still picture, messy background, three legs, many people in the background, walking backwards" - ), - "--seed", "1024", - "--output-path", str(output_dir), - ] - - subprocess.run(cmd, check=True) - - assert output_dir.exists(), f"Output directory {output_dir} does not exist" + config = { + "generator": { + "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "engine": { + "num_gpus": 1, + "parallelism": { + "tp_size": 1, + "sp_size": 1 + }, + "offload": { + "dit": False, + "vae": False, + "text_encoder": True, + "pin_cpu_memory": False, + }, + }, + "pipeline": { + "experimental": { + "flow_shift": 8.0, + "moba_config_path": moba_config, + }, + }, + }, + "request": { + "prompt": + "A majestic lion strides across the golden savanna, " + "its powerful frame glistening under the warm afternoon " + "sun. The tall grass ripples gently in the breeze, " + "enhancing the lion's commanding presence. The tone is " + "vibrant, embodying the raw energy of the wild. 
Low " + "angle, steady tracking shot, cinematic.", + "negative_prompt": + "Bright tones, overexposed, static, blurred details, " + "subtitles, style, works, paintings, images, static, " + "overall gray, worst quality, low quality, JPEG " + "compression residue, ugly, incomplete, extra fingers, " + "poorly drawn hands, poorly drawn faces, deformed, " + "disfigured, misshapen limbs, fused fingers, still " + "picture, messy background, three legs, many people in " + "the background, walking backwards", + "sampling": { + "seed": 1024, + "num_frames": 77, + "height": 480, + "width": 832, + "fps": 16, + "num_inference_steps": 10, + "guidance_scale": 6.0, + }, + "output": { + "output_path": str(output_dir), + }, + }, + } + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + cmd = ["fastvideo", "generate", "--config", config_path] + subprocess.run(cmd, check=True) + finally: + os.unlink(config_path) + + assert output_dir.exists(), \ + f"Output directory {output_dir} does not exist" video_files = list(output_dir.glob("*.mp4")) assert len(video_files) > 0, "No video files were generated" for video_file in video_files: - assert video_file.stat().st_size > 0, f"Video file {video_file} is empty" + assert video_file.stat().st_size > 0, \ + f"Video file {video_file} is empty" + if __name__ == "__main__": test_inference_vmoba() diff --git a/fastvideo/utils.py b/fastvideo/utils.py index 75b9bf5208..1ed770a257 100644 --- a/fastvideo/utils.py +++ b/fastvideo/utils.py @@ -189,10 +189,17 @@ def __init__(self, *args, **kwargs) -> None: def parse_args( # type: ignore[override] self, args=None, namespace=None) -> argparse.Namespace: + namespace, unknown = self.parse_known_args(args, namespace) + if unknown: + self.error(f"unrecognized arguments: {' '.join(unknown)}") + return namespace + + def parse_known_args( # type: ignore[override] + self, args=None, namespace=None) -> tuple[argparse.Namespace, 
list[str]]: if args is None: args = sys.argv[1:] - if '--config' in args: + if '--config' in args and not self._should_defer_config_loading(args): args = self._pull_args_from_config(args) # Convert underscores to dashes and vice versa in argument names @@ -201,10 +208,16 @@ def parse_args( # type: ignore[override] if arg.startswith('--'): if '=' in arg: key, value = arg.split('=', 1) - key = '--' + key[len('--'):].replace('_', '-') + normalized_key = key[len('--'):] + if '.' not in normalized_key: + normalized_key = normalized_key.replace('_', '-') + key = '--' + normalized_key processed_args.append(f'{key}={value}') else: - processed_args.append('--' + arg[len('--'):].replace('_', '-')) + normalized_key = arg[len('--'):] + if '.' not in normalized_key: + normalized_key = normalized_key.replace('_', '-') + processed_args.append('--' + normalized_key) elif arg.startswith('-O') and arg != '-O' and len(arg) == 2: # allow -O flag to be used without space, e.g. -O3 processed_args.append('-O') @@ -212,7 +225,7 @@ def parse_args( # type: ignore[override] else: processed_args.append(arg) - namespace = super().parse_args(processed_args, namespace) + namespace, unknown = super().parse_known_args(processed_args, namespace) # Track which arguments were explicitly provided namespace._provided = set() @@ -238,7 +251,15 @@ def parse_args( # type: ignore[override] else: i += 1 - return namespace # type: ignore[no-any-return] + return namespace, unknown # type: ignore[no-any-return] + + def _should_defer_config_loading(self, args: list[str]) -> bool: + if getattr(self, "defer_config_loading", False): + return True + subcommand = next((arg for arg in args if not arg.startswith('-')), None) + if subcommand in {"generate", "serve"}: + return True + return self.prog.split()[-1] in {"generate", "serve"} def _pull_args_from_config(self, args: list[str]) -> list[str]: """Method to pull arguments specified in the config file diff --git a/scripts/inference/README.md 
b/scripts/inference/README.md new file mode 100644 index 0000000000..a6837456bf --- /dev/null +++ b/scripts/inference/README.md @@ -0,0 +1,34 @@ +# Inference Configs + +These files are nested inference configs for the config-first CLI. + +Run them with: + +```bash +fastvideo generate --config scripts/inference/CONFIG.yaml +``` + +Or use the helper wrapper: + +```bash +bash scripts/inference/run.sh scripts/inference/CONFIG.yaml +``` + +Override config values with dotted paths: + +```bash +fastvideo generate --config scripts/inference/CONFIG.yaml \ + --request.sampling.seed 42 \ + --request.prompt "A panda skiing at sunset" +``` + +The same overrides work through the wrapper: + +```bash +bash scripts/inference/run.sh scripts/inference/CONFIG.yaml \ + --generator.engine.num_gpus 2 \ + --request.output.output_path outputs/custom_run +``` + +Some configs require an attention backend environment variable. When needed, +the file header shows the exact command to use. diff --git a/scripts/inference/inference_fasthunyuan.yaml b/scripts/inference/inference_fasthunyuan.yaml new file mode 100644 index 0000000000..f3c8a4038c --- /dev/null +++ b/scripts/inference/inference_fasthunyuan.yaml @@ -0,0 +1,24 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN fastvideo generate --config scripts/inference/inference_fasthunyuan.yaml +generator: + model_path: FastVideo/FastHunyuan-Diffusers + engine: + num_gpus: 4 + parallelism: + tp_size: 1 + sp_size: 4 + pipeline: + experimental: + embedded_cfg_scale: 6 + flow_shift: 17 +request: + prompt: A beautiful woman in a red dress walking down a street + sampling: + seed: 1024 + num_frames: 125 + height: 720 + width: 1280 + num_inference_steps: 6 + guidance_scale: 1 + output: + output_path: outputs_video/ diff --git a/scripts/inference/inference_hunyuan.yaml b/scripts/inference/inference_hunyuan.yaml new file mode 100644 index 0000000000..2efe846b68 --- /dev/null +++ b/scripts/inference/inference_hunyuan.yaml @@ -0,0 +1,24 @@ +# Run with: +# 
FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN fastvideo generate --config scripts/inference/inference_hunyuan.yaml +generator: + model_path: hunyuanvideo-community/HunyuanVideo + engine: + num_gpus: 4 + parallelism: + tp_size: 1 + sp_size: 4 + pipeline: + experimental: + embedded_cfg_scale: 6 + flow_shift: 7 +request: + prompt: A beautiful woman in a red dress walking down a street + sampling: + seed: 1024 + num_frames: 125 + height: 720 + width: 1280 + num_inference_steps: 50 + guidance_scale: 1 + output: + output_path: outputs_video/ diff --git a/scripts/inference/inference_longcat.yaml b/scripts/inference/inference_longcat.yaml new file mode 100644 index 0000000000..7887a28a7b --- /dev/null +++ b/scripts/inference/inference_longcat.yaml @@ -0,0 +1,43 @@ +# Run with: +# fastvideo generate --config scripts/inference/inference_longcat.yaml +generator: + model_path: FastVideo/LongCat-Video-T2V-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + text_encoder: false + vae: false + pin_cpu_memory: false + pipeline: + experimental: + enable_bsa: false +request: + prompt: >- + In a realistic photography style, a white boy around seven or eight years + old sits on a park bench, wearing a light blue T-shirt, denim shorts, and + white sneakers. He holds an ice cream cone with vanilla and chocolate + flavors, and beside him is a medium-sized golden Labrador. Smiling, the + boy offers the ice cream to the dog, who eagerly licks it with its tongue. + The sun is shining brightly, and the background features a green lawn and + several tall trees, creating a warm and loving scene. 
+ negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + sampling: + seed: 42 + num_frames: 93 + height: 480 + width: 832 + fps: 15 + num_inference_steps: 50 + guidance_scale: 4.0 + output: + output_path: outputs_video/longcat_t2v diff --git a/scripts/inference/inference_longcat_distill.yaml b/scripts/inference/inference_longcat_distill.yaml new file mode 100644 index 0000000000..8e6b77d3dc --- /dev/null +++ b/scripts/inference/inference_longcat_distill.yaml @@ -0,0 +1,44 @@ +# Run with: +# fastvideo generate --config scripts/inference/inference_longcat_distill.yaml +generator: + model_path: FastVideo/LongCat-Video-T2V-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + pin_cpu_memory: false + pipeline: + components: + lora_path: FastVideo/LongCat-Video-T2V-Distilled-LoRA + experimental: + enable_bsa: false + lora_nickname: distilled +request: + prompt: >- + In a realistic photography style, a white boy around seven or eight years + old sits on a park bench, wearing a light blue T-shirt, denim shorts, and + white sneakers. He holds an ice cream cone with vanilla and chocolate + flavors, and beside him is a medium-sized golden Labrador. Smiling, the + boy offers the ice cream to the dog, who eagerly licks it with its tongue. + The sun is shining brightly, and the background features a green lawn and + several tall trees, creating a warm and loving scene. 
+ negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + sampling: + seed: 42 + num_frames: 93 + height: 480 + width: 832 + fps: 15 + num_inference_steps: 16 + guidance_scale: 1.0 + output: + output_path: outputs_video/longcat_distill diff --git a/scripts/inference/inference_longcat_i2v.yaml b/scripts/inference/inference_longcat_i2v.yaml new file mode 100644 index 0000000000..7ab21af18f --- /dev/null +++ b/scripts/inference/inference_longcat_i2v.yaml @@ -0,0 +1,42 @@ +# Run with: +# fastvideo generate --config scripts/inference/inference_longcat_i2v.yaml +generator: + model_path: FastVideo/LongCat-Video-I2V-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + pin_cpu_memory: false + pipeline: + workload_type: i2v + experimental: + enable_bsa: false +request: + prompt: >- + A woman sits at a wooden table by the window in a cozy café. She reaches + out with her right hand, picks up the white coffee cup from the saucer, + and gently brings it to her lips to take a sip. After drinking, she places + the cup back on the table and looks out the window, enjoying the peaceful + atmosphere. 
+ negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + image_path: assets/girl.png + sampling: + seed: 42 + num_frames: 93 + height: 480 + width: 480 + fps: 15 + num_inference_steps: 50 + guidance_scale: 4.0 + output: + output_path: outputs_video/longcat_i2v diff --git a/scripts/inference/inference_longcat_refine_fromvideo.yaml b/scripts/inference/inference_longcat_refine_fromvideo.yaml new file mode 100644 index 0000000000..29046b6846 --- /dev/null +++ b/scripts/inference/inference_longcat_refine_fromvideo.yaml @@ -0,0 +1,51 @@ +# Run with: +# fastvideo generate --config scripts/inference/inference_longcat_refine_fromvideo.yaml +generator: + model_path: FastVideo/LongCat-Video-T2V-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + pin_cpu_memory: false + pipeline: + components: + lora_path: FastVideo/LongCat-Video-T2V-Refinement-LoRA + experimental: + enable_bsa: true + bsa_sparsity: 0.875 + bsa_chunk_q: [4, 4, 8] + bsa_chunk_k: [4, 4, 8] + lora_nickname: refinement +request: + prompt: >- + In a realistic photography style, a white boy around seven or eight years + old sits on a park bench, wearing a light blue T-shirt, denim shorts, and + white sneakers. He holds an ice cream cone with vanilla and chocolate + flavors, and beside him is a medium-sized golden Labrador. Smiling, the + boy offers the ice cream to the dog, who eagerly licks it with its tongue. + The sun is shining brightly, and the background features a green lawn and + several tall trees, creating a warm and loving scene. 
+ negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + refine_from: outputs_video/longcat_distill/In a realistic photography style, a white boy around seven or eight years old sits on a park bench,.mp4 + sampling: + seed: 42 + height: 720 + width: 1280 + fps: 30 + num_inference_steps: 50 + guidance_scale: 1.0 + extensions: + t_thresh: 0.5 + spatial_refine_only: false + num_cond_frames: 0 + output: + output_path: outputs_video/longcat_refine_720p diff --git a/scripts/inference/inference_longcat_vc.yaml b/scripts/inference/inference_longcat_vc.yaml new file mode 100644 index 0000000000..0fe5c78b3c --- /dev/null +++ b/scripts/inference/inference_longcat_vc.yaml @@ -0,0 +1,43 @@ +# Run with: +# fastvideo generate --config scripts/inference/inference_longcat_vc.yaml +generator: + model_path: FastVideo/LongCat-Video-VC-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + pin_cpu_memory: false + pipeline: + experimental: + enable_bsa: false +request: + prompt: >- + A person rides a motorcycle along a long, straight road that stretches + between a body of water and a forested hillside. The rider steadily + accelerates, keeping the motorcycle centered between the guardrails, while + the scenery passes by on both sides. The video captures the journey from + the rider's perspective, emphasizing the sense of motion and adventure. 
+ negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + video_path: assets/motorcycle.mp4 + sampling: + seed: 42 + num_frames: 93 + height: 480 + width: 832 + fps: 15 + num_inference_steps: 50 + guidance_scale: 4.0 + extensions: + num_cond_frames: 13 + output: + output_path: outputs_video/longcat_vc diff --git a/scripts/inference/inference_wan.yaml b/scripts/inference/inference_wan.yaml new file mode 100644 index 0000000000..2baa89debb --- /dev/null +++ b/scripts/inference/inference_wan.yaml @@ -0,0 +1,36 @@ +# Run with: +# fastvideo generate --config scripts/inference/inference_wan.yaml +generator: + model_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + vae: false + pin_cpu_memory: false + pipeline: + experimental: + flow_shift: 8.0 +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 77 + height: 480 + width: 832 + fps: 16 + num_inference_steps: 50 + guidance_scale: 6.0 + output: + output_path: outputs_video/ diff --git a/scripts/inference/inference_wan_1.3B_VMoba.yaml b/scripts/inference/inference_wan_1.3B_VMoba.yaml new file mode 100644 index 
0000000000..02a65dc101 --- /dev/null +++ b/scripts/inference/inference_wan_1.3B_VMoba.yaml @@ -0,0 +1,37 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=VMOBA_ATTN fastvideo generate --config scripts/inference/inference_wan_1.3B_VMoba.yaml +generator: + model_path: FastVideo/Wan2.1-T2V-1.3B-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + vae: false + pin_cpu_memory: false + pipeline: + experimental: + flow_shift: 8.0 + moba_config_path: fastvideo/configs/backend/vmoba/wan_1.3B_77_480_832.json +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 77 + height: 480 + width: 832 + fps: 16 + num_inference_steps: 50 + guidance_scale: 6.0 + output: + output_path: outputs_video/ diff --git a/scripts/inference/inference_wan_VSA.yaml b/scripts/inference/inference_wan_VSA.yaml new file mode 100644 index 0000000000..d92d94bcdd --- /dev/null +++ b/scripts/inference/inference_wan_VSA.yaml @@ -0,0 +1,37 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN fastvideo generate --config scripts/inference/inference_wan_VSA.yaml +generator: + model_path: FastVideo/Wan2.1-VSA-T2V-14B-720P-Diffusers + engine: + num_gpus: 1 + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + vae: false + pin_cpu_memory: false + pipeline: + experimental: + flow_shift: 5.0 + VSA_sparsity: 0.9 +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, 
JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 77 + height: 448 + width: 832 + fps: 16 + num_inference_steps: 50 + guidance_scale: 5.0 + output: + output_path: outputs_Wan-VSA-14B/ diff --git a/scripts/inference/inference_wan_VSA_DMD_14B_480P.yaml b/scripts/inference/inference_wan_VSA_DMD_14B_480P.yaml new file mode 100644 index 0000000000..a184c1093b --- /dev/null +++ b/scripts/inference/inference_wan_VSA_DMD_14B_480P.yaml @@ -0,0 +1,34 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN fastvideo generate --config scripts/inference/inference_wan_VSA_DMD_14B_480P.yaml +generator: + model_path: FastVideo/FastWan2.1-T2V-14B-480P-Diffusers + engine: + num_gpus: 1 + compile: + enabled: true + parallelism: + tp_size: 1 + sp_size: 1 + pipeline: + experimental: + VSA_sparsity: 0.9 + dmd_denoising_steps: [1000, 757, 522] +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 81 + height: 480 + width: 832 + fps: 16 + num_inference_steps: 3 + output: + output_path: outputs_video_dmd_14B/ diff --git a/scripts/inference/inference_wan_VSA_DMD_14B_720P.yaml b/scripts/inference/inference_wan_VSA_DMD_14B_720P.yaml new file mode 100644 index 0000000000..395bb1a3cc --- /dev/null +++ 
b/scripts/inference/inference_wan_VSA_DMD_14B_720P.yaml @@ -0,0 +1,34 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN fastvideo generate --config scripts/inference/inference_wan_VSA_DMD_14B_720P.yaml +generator: + model_path: FastVideo/FastWan2.1-T2V-14B-480P-Diffusers + engine: + num_gpus: 1 + compile: + enabled: true + parallelism: + tp_size: 1 + sp_size: 1 + pipeline: + experimental: + VSA_sparsity: 0.9 + dmd_denoising_steps: [1000, 757, 522] +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 81 + height: 720 + width: 1280 + fps: 16 + num_inference_steps: 3 + output: + output_path: outputs_video_dmd_14B_720P/ diff --git a/scripts/inference/inference_wan_VSA_DMD_1_3B.yaml b/scripts/inference/inference_wan_VSA_DMD_1_3B.yaml new file mode 100644 index 0000000000..4c99c29a8e --- /dev/null +++ b/scripts/inference/inference_wan_VSA_DMD_1_3B.yaml @@ -0,0 +1,38 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN fastvideo generate --config scripts/inference/inference_wan_VSA_DMD_1_3B.yaml +generator: + model_path: FastVideo/FastWan2.1-T2V-1.3B-Diffusers + engine: + num_gpus: 1 + compile: + enabled: true + parallelism: + tp_size: 1 + sp_size: 1 + offload: + dit: false + vae: false + pin_cpu_memory: false + pipeline: + experimental: + VSA_sparsity: 0.8 + dmd_denoising_steps: [1000, 757, 522] +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG 
compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 81 + height: 480 + width: 832 + fps: 16 + num_inference_steps: 3 + output: + output_path: outputs_video_dmd_1.3B/ diff --git a/scripts/inference/inference_wan_VSA_DMD_5B_720P.yaml b/scripts/inference/inference_wan_VSA_DMD_5B_720P.yaml new file mode 100644 index 0000000000..d0f6dc0c57 --- /dev/null +++ b/scripts/inference/inference_wan_VSA_DMD_5B_720P.yaml @@ -0,0 +1,33 @@ +# Run with: +# FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN fastvideo generate --config scripts/inference/inference_wan_VSA_DMD_5B_720P.yaml +generator: + model_path: FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers + engine: + num_gpus: 1 + compile: + enabled: true + parallelism: + tp_size: 1 + sp_size: 1 + pipeline: + experimental: + dmd_denoising_steps: [1000, 757, 522] +request: + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + prompt_path: assets/prompt.txt + sampling: + seed: 1024 + num_frames: 104 + height: 704 + width: 1280 + fps: 24 + num_inference_steps: 3 + output: + output_path: outputs_video_dmd_5B_720P/ diff --git a/scripts/inference/inference_wan_i2v.yaml b/scripts/inference/inference_wan_i2v.yaml new file mode 100644 index 0000000000..53585b4829 --- /dev/null +++ b/scripts/inference/inference_wan_i2v.yaml @@ -0,0 +1,43 @@ +# Run with: +# fastvideo generate --config 
scripts/inference/inference_wan_i2v.yaml +generator: + model_path: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers + engine: + num_gpus: 2 + parallelism: + tp_size: 2 + sp_size: 2 + offload: + dit: true + vae: true + text_encoder: true + image_encoder: true + pin_cpu_memory: true + pipeline: + workload_type: i2v + experimental: + flow_shift: 3.0 +request: + prompt: >- + An astronaut hatching from an egg, on the surface of the moon, the + darkness and depth of space realised in the background. High quality, + ultrarealistic detail and breath-taking movie-like camera shot. + negative_prompt: >- + Bright tones, overexposed, static, blurred details, subtitles, style, + works, paintings, images, static, overall gray, worst quality, low + quality, JPEG compression residue, ugly, incomplete, extra fingers, + poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen + limbs, fused fingers, still picture, messy background, three legs, many + people in the background, walking backwards + inputs: + image_path: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg + sampling: + seed: 1024 + num_frames: 77 + height: 480 + width: 832 + fps: 16 + num_inference_steps: 40 + guidance_scale: 5.0 + output: + output_path: outputs_i2v/ diff --git a/scripts/inference/run.sh b/scripts/inference/run.sh new file mode 100755 index 0000000000..9721c374c6 --- /dev/null +++ b/scripts/inference/run.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Launch inference from a nested config. +# +# Usage: +# bash scripts/inference/run.sh CONFIG.yaml [--dotted.key value ...] 
+# +# Examples: +# bash scripts/inference/run.sh scripts/inference/inference_wan.yaml +# bash scripts/inference/run.sh scripts/inference/inference_wan.yaml \ +# --request.sampling.seed 42 \ +# --generator.engine.num_gpus 2 + +set -euo pipefail + +CONFIG="${1:?Usage: $0 [--dotted.key value ...]}" +shift + +echo "=== FastVideo Inference ===" +echo "Config: ${CONFIG}" +echo "Extra args: $*" +echo "===========================" + +fastvideo generate --config "${CONFIG}" "$@" diff --git a/scripts/inference/v1_inference_fasthunyuan.sh b/scripts/inference/v1_inference_fasthunyuan.sh deleted file mode 100644 index e80fec75fa..0000000000 --- a/scripts/inference/v1_inference_fasthunyuan.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -num_gpus=4 -export MODEL_BASE=FastVideo/FastHunyuan-Diffusers -export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --height 720 \ - --width 1280 \ - --num-frames 125 \ - --num-inference-steps 6 \ - --guidance-scale 1 \ - --embedded-cfg-scale 6 \ - --flow-shift 17 \ - --prompt "A beautiful woman in a red dress walking down a street" \ - --seed 1024 \ - --output-path outputs_video/ \ No newline at end of file diff --git a/scripts/inference/v1_inference_hunyuan.sh b/scripts/inference/v1_inference_hunyuan.sh deleted file mode 100644 index 529305f3c7..0000000000 --- a/scripts/inference/v1_inference_hunyuan.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -num_gpus=4 -export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --height 720 \ - --width 1280 \ - --num-frames 125 \ - --num-inference-steps 50 \ - --guidance-scale 1 \ - --embedded-cfg-scale 6 \ - --flow-shift 7 \ - --prompt "A beautiful woman in a red dress walking 
down a street" \ - --seed 1024 \ - --output-path outputs_video/ diff --git a/scripts/inference/v1_inference_longcat.sh b/scripts/inference/v1_inference_longcat.sh deleted file mode 100755 index 80590b2052..0000000000 --- a/scripts/inference/v1_inference_longcat.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash - -# LongCat Text-to-Video (T2V) Inference Script -# -# This script runs LongCat T2V inference using the fastvideo CLI. -# -# Usage: -# bash scripts/inference/v1_inference_longcat.sh -# -# Prerequisites: -# - Install fastvideo: pip install -e . -# - The model weights will be auto-downloaded from HuggingFace - -num_gpus=1 - -export FASTVIDEO_ATTENTION_BACKEND= - -# Model path options: -# Option 1: HuggingFace model (auto-downloaded) -export MODEL_BASE=FastVideo/LongCat-Video-T2V-Diffusers - -# Option 2: Local weights (uncomment if you have local weights) -# For local weights, convert the official weights to FastVideo native format -# conversion method: python scripts/checkpoint_conversion/longcat_to_fastvideo.py -# --source /path/to/LongCat-Video/weights/LongCat-Video -# --output weights/longcat-native -# export MODEL_BASE=weights/longcat-native - -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload False \ - --text-encoder-cpu-offload False \ - --pin-cpu-memory False \ - --enable-bsa False \ - --height 480 \ - --width 832 \ - --num-frames 93 \ - --num-inference-steps 50 \ - --fps 15 \ - --guidance-scale 4.0 \ - --prompt "In a realistic photography style, a white boy around seven or eight years old sits on a park bench, wearing a light blue T-shirt, denim shorts, and white sneakers. He holds an ice cream cone with vanilla and chocolate flavors, and beside him is a medium-sized golden Labrador. Smiling, the boy offers the ice cream to the dog, who eagerly licks it with its tongue. 
The sun is shining brightly, and the background features a green lawn and several tall trees, creating a warm and loving scene." \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 42 \ - --output-path outputs_video/longcat_t2v diff --git a/scripts/inference/v1_inference_longcat_distill.sh b/scripts/inference/v1_inference_longcat_distill.sh deleted file mode 100755 index 40e376ff7a..0000000000 --- a/scripts/inference/v1_inference_longcat_distill.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -# LongCat T2V Distilled Inference Script -# -# This script runs LongCat T2V with distillation LoRA (16 steps instead of 50). -# Uses the distilled LoRA for faster generation. -# -# Usage: -# bash scripts/inference/v1_inference_longcat_distill.sh -# -# Prerequisites: -# - Install fastvideo: pip install -e . 
-# - The model weights will be auto-downloaded from HuggingFace - -num_gpus=1 - -export FASTVIDEO_ATTENTION_BACKEND= - -# Model path - HuggingFace model (auto-downloaded) -export MODEL_BASE=FastVideo/LongCat-Video-T2V-Diffusers - -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload True \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --enable-bsa False \ - --lora-path "FastVideo/LongCat-Video-T2V-Distilled-LoRA" \ - --lora-nickname "distilled" \ - --height 480 \ - --width 832 \ - --num-frames 93 \ - --num-inference-steps 16 \ - --fps 15 \ - --guidance-scale 1.0 \ - --prompt "In a realistic photography style, a white boy around seven or eight years old sits on a park bench, wearing a light blue T-shirt, denim shorts, and white sneakers. He holds an ice cream cone with vanilla and chocolate flavors, and beside him is a medium-sized golden Labrador. Smiling, the boy offers the ice cream to the dog, who eagerly licks it with its tongue. The sun is shining brightly, and the background features a green lawn and several tall trees, creating a warm and loving scene." 
\ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 42 \ - --output-path outputs_video/longcat_distill diff --git a/scripts/inference/v1_inference_longcat_i2v.sh b/scripts/inference/v1_inference_longcat_i2v.sh deleted file mode 100644 index 1eafccd04d..0000000000 --- a/scripts/inference/v1_inference_longcat_i2v.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -# LongCat Image-to-Video (I2V) Inference Script -# -# This script runs LongCat I2V inference using the fastvideo CLI. -# LongCat I2V takes an input image and generates a video from it. -# -# Usage: -# bash scripts/inference/v1_inference_longcat_i2v.sh -# -# Prerequisites: -# - Install fastvideo: pip install -e . -# - The model weights will be auto-downloaded from HuggingFace -# - Or use local weights if you have them - -num_gpus=1 - -export FASTVIDEO_ATTENTION_BACKEND= - -# Model path options: -# Option 1: HuggingFace model (auto-downloaded) -export MODEL_BASE=FastVideo/LongCat-Video-I2V-Diffusers - -# Option 2: Local weights (uncomment if you have local weights) -# export MODEL_BASE=weights/longcat-for-i2v - -# Input image path (must be square for LongCat I2V) -IMAGE_PATH="assets/girl.png" - -# Check if image exists -if [ ! 
-f "$IMAGE_PATH" ]; then - echo "Error: Image not found at $IMAGE_PATH" - echo "Please provide a valid image path" - exit 1 -fi - -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload True \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --enable-bsa False \ - --image-path "$IMAGE_PATH" \ - --height 480 \ - --width 480 \ - --num-frames 93 \ - --num-inference-steps 50 \ - --fps 15 \ - --guidance-scale 4.0 \ - --prompt "A woman sits at a wooden table by the window in a cozy café. She reaches out with her right hand, picks up the white coffee cup from the saucer, and gently brings it to her lips to take a sip. After drinking, she places the cup back on the table and looks out the window, enjoying the peaceful atmosphere." \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 42 \ - --output-path outputs_video/longcat_i2v - - - diff --git a/scripts/inference/v1_inference_longcat_refine_fromvideo.sh b/scripts/inference/v1_inference_longcat_refine_fromvideo.sh deleted file mode 100755 index 819e1b870b..0000000000 --- a/scripts/inference/v1_inference_longcat_refine_fromvideo.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -# LongCat T2V Refinement Script (480p -> 720p) -# -# This script refines a 480p distilled video to 720p using the refinement LoRA. -# Run v1_inference_longcat_distill.sh first to generate the 480p video. -# -# Usage: -# bash scripts/inference/v1_inference_longcat_refine_fromvideo.sh -# -# Prerequisites: -# - Install fastvideo: pip install -e . 
-# - The model weights will be auto-downloaded from HuggingFace -# - Run v1_inference_longcat_distill.sh first to generate input video - -num_gpus=1 - -export FASTVIDEO_ATTENTION_BACKEND= - -# Model path - HuggingFace model (auto-downloaded) -export MODEL_BASE=FastVideo/LongCat-Video-T2V-Diffusers - -INPUT_VIDEO="outputs_video/longcat_distill/In a realistic photography style, a white boy around seven or eight years old sits on a park bench,.mp4" -REFINE_OUTPUT="outputs_video/longcat_refine_720p" - -# Prompt used for base generation (must match distill script) -PROMPT="In a realistic photography style, a white boy around seven or eight years old sits on a park bench, wearing a light blue T-shirt, denim shorts, and white sneakers. He holds an ice cream cone with vanilla and chocolate flavors, and beside him is a medium-sized golden Labrador. Smiling, the boy offers the ice cream to the dog, who eagerly licks it with its tongue. The sun is shining brightly, and the background features a green lawn and several tall trees, creating a warm and loving scene." - -echo "==========================================" -echo "LongCat 480p -> 720p Refinement" -echo "==========================================" -echo "" -echo "Input: $INPUT_VIDEO" -echo "Output: $REFINE_OUTPUT" -echo "" - -# Check if input video exists -if [ ! -f "$INPUT_VIDEO" ]; then - echo "Error: Input video not found: $INPUT_VIDEO" - echo "Please run v1_inference_longcat_distill.sh first to generate the 480p video" - exit 1 -fi - -echo "Configuring refinement (BSA enabled, refinement LoRA)..." 
-echo "Input video: $INPUT_VIDEO" -echo "BSA enabled with sparsity=0.875" -echo "Refinement LoRA loaded" -echo "" - -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload True \ - --vae-cpu-offload True \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --enable-bsa True \ - --bsa-sparsity 0.875 \ - --bsa-chunk-q 4 4 8 \ - --bsa-chunk-k 4 4 8 \ - --lora-path "FastVideo/LongCat-Video-T2V-Refinement-LoRA" \ - --lora-nickname "refinement" \ - --refine-from "$INPUT_VIDEO" \ - --t-thresh 0.5 \ - --spatial-refine-only False \ - --num-cond-frames 0 \ - --height 720 \ - --width 1280 \ - --num-inference-steps 50 \ - --fps 30 \ - --guidance-scale 1.0 \ - --prompt "$PROMPT" \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 42 \ - --output-path "$REFINE_OUTPUT" - -echo "" -echo "==========================================" -echo "Refinement Complete!" -echo "==========================================" -echo "" -echo "Output directory: $REFINE_OUTPUT" -echo "" - diff --git a/scripts/inference/v1_inference_longcat_vc.sh b/scripts/inference/v1_inference_longcat_vc.sh deleted file mode 100644 index b7c2aea3ad..0000000000 --- a/scripts/inference/v1_inference_longcat_vc.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# LongCat Video Continuation (VC) Inference Script -# -# This script runs LongCat VC inference using the fastvideo CLI. -# LongCat VC takes an input video and generates a continuation of it. 
-# -# Usage: -# bash scripts/inference/v1_inference_longcat_vc.sh -# -# Prerequisites: -# - Install fastvideo: pip install -e . -# - The model weights will be auto-downloaded from HuggingFace -# - Or use local weights if you have them - -num_gpus=1 - -export FASTVIDEO_ATTENTION_BACKEND= - -# Model path options: -# Option 1: HuggingFace model (auto-downloaded) -export MODEL_BASE=FastVideo/LongCat-Video-VC-Diffusers - -# Option 2: Local weights (uncomment if you have local weights) -# export MODEL_BASE=weights/longcat-vc-upload - -# Input video path -VIDEO_PATH="assets/motorcycle.mp4" - -# Check if video exists -if [ ! -f "$VIDEO_PATH" ]; then - echo "Error: Video not found at $VIDEO_PATH" - echo "Please provide a valid video path" - exit 1 -fi - -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload True \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --enable-bsa False \ - --video-path "$VIDEO_PATH" \ - --num-cond-frames 13 \ - --height 480 \ - --width 832 \ - --num-frames 93 \ - --num-inference-steps 50 \ - --fps 15 \ - --guidance-scale 4.0 \ - --prompt "A person rides a motorcycle along a long, straight road that stretches between a body of water and a forested hillside. The rider steadily accelerates, keeping the motorcycle centered between the guardrails, while the scenery passes by on both sides. The video captures the journey from the rider's perspective, emphasizing the sense of motion and adventure." 
\ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 42 \ - --output-path outputs_video/longcat_vc - - - diff --git a/scripts/inference/v1_inference_wan.sh b/scripts/inference/v1_inference_wan.sh deleted file mode 100755 index 42d030164b..0000000000 --- a/scripts/inference/v1_inference_wan.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND= -export MODEL_BASE=Wan-AI/Wan2.1-T2V-1.3B-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -# You can either use --prompt or --prompt-txt, but not both. -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload False \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --height 480 \ - --width 832 \ - --num-frames 77 \ - --num-inference-steps 50 \ - --fps 16 \ - --guidance-scale 6.0 \ - --flow-shift 8.0 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_video/ \ No newline at end of file diff --git a/scripts/inference/v1_inference_wan_1.3B_VMoba.sh b/scripts/inference/v1_inference_wan_1.3B_VMoba.sh deleted file mode 100755 index 
2332de0d33..0000000000 --- a/scripts/inference/v1_inference_wan_1.3B_VMoba.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND=VMOBA_ATTN -export MODEL_BASE=FastVideo/Wan2.1-T2V-1.3B-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -# You can either use --prompt or --prompt-txt, but not both. -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload False \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --height 480 \ - --width 832 \ - --num-frames 77 \ - --num-inference-steps 50 \ - --moba-config-path fastvideo/configs/backend/vmoba/wan_1.3B_77_480_832.json \ - --fps 16 \ - --guidance-scale 6.0 \ - --flow-shift 8.0 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_video/ diff --git a/scripts/inference/v1_inference_wan_VSA.sh b/scripts/inference/v1_inference_wan_VSA.sh deleted file mode 100755 index 02dcde417b..0000000000 --- a/scripts/inference/v1_inference_wan_VSA.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN -# change model path to local dir if you want to inference using your checkpoint -export MODEL_BASE=FastVideo/Wan2.1-VSA-T2V-14B-720P-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --dit-cpu-offload False \ - --vae-cpu-offload False \ - 
--text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --height 448 \ - --width 832 \ - --num-frames 77 \ - --num-inference-steps 50 \ - --fps 16 \ - --guidance-scale 5.0 \ - --flow-shift 5.0 \ - --VSA-sparsity 0.9 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_Wan-VSA-14B/ \ No newline at end of file diff --git a/scripts/inference/v1_inference_wan_VSA_DMD.sh b/scripts/inference/v1_inference_wan_VSA_DMD.sh deleted file mode 100755 index 5be11f8b1b..0000000000 --- a/scripts/inference/v1_inference_wan_VSA_DMD.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -# 480P 1.3B runnable on 4090 -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN -export MODEL_BASE=FastVideo/FastWan2.1-T2V-1.3B-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -# You can either use --prompt or --prompt-txt, but not both. 
-fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --height 480 \ - --width 832 \ - --num-frames 81 \ - --num-inference-steps 3 \ - --dit-cpu-offload False \ - --vae-cpu-offload False \ - --text-encoder-cpu-offload True \ - --pin-cpu-memory False \ - --fps 16 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_video_dmd_1.3B/ \ - --VSA-sparsity 0.8 \ - --dmd-denoising-steps "1000,757,522" \ - --enable_torch_compile - - - -# 480P 14B -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN -export MODEL_BASE=FastVideo/FastWan2.1-T2V-14B-480P-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -# You can either use --prompt or --prompt-txt, but not both. 
-fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --height 480 \ - --width 832 \ - --num-frames 81 \ - --num-inference-steps 3 \ - --fps 16 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_video_dmd_14B/ \ - --VSA-sparsity 0.9 \ - --dmd-denoising-steps "1000,757,522" \ - --enable_torch_compile - - - -# 720P 14B -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND=VIDEO_SPARSE_ATTN -export MODEL_BASE=FastVideo/FastWan2.1-T2V-14B-480P-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -# You can either use --prompt or --prompt-txt, but not both. 
-fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --height 720 \ - --width 1280 \ - --num-frames 81 \ - --num-inference-steps 3 \ - --fps 16 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_video_dmd_14B_720P/ \ - --VSA-sparsity 0.9 \ - --dmd-denoising-steps "1000,757,522" \ - --enable_torch_compile - -# 720P 5B -num_gpus=1 -export FASTVIDEO_ATTENTION_BACKEND=FLASH_ATTN -export MODEL_BASE=FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -# You can either use --prompt or --prompt-txt, but not both. 
-fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size 1 \ - --num-gpus $num_gpus \ - --height 704 \ - --width 1280 \ - --num-frames 104 \ - --num-inference-steps 3 \ - --fps 24 \ - --prompt-txt assets/prompt.txt \ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_video_dmd_5B_720P/ \ - --dmd-denoising-steps "1000,757,522" \ - --enable_torch_compile \ No newline at end of file diff --git a/scripts/inference/v1_inference_wan_i2v.sh b/scripts/inference/v1_inference_wan_i2v.sh deleted file mode 100644 index c2ef35fc7d..0000000000 --- a/scripts/inference/v1_inference_wan_i2v.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -num_gpus=2 -export FASTVIDEO_ATTENTION_BACKEND= -export MODEL_BASE=Wan-AI/Wan2.1-I2V-14B-480P-Diffusers -# export MODEL_BASE=hunyuanvideo-community/HunyuanVideo -fastvideo generate \ - --model-path $MODEL_BASE \ - --sp-size $num_gpus \ - --tp-size $num_gpus \ - --num-gpus $num_gpus \ - --height 480 \ - --width 832 \ - --num-frames 77 \ - --num-inference-steps 40 \ - --fps 16 \ - --flow-shift 3.0 \ - --guidance-scale 5.0 \ - --image-path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg" \ - --prompt "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot." 
\ - --negative-prompt "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards" \ - --seed 1024 \ - --output-path outputs_i2v/ \ No newline at end of file