forked from NVIDIA/TensorRT-LLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathflux_transforms.yaml
More file actions
30 lines (27 loc) · 803 Bytes
/
flux_transforms.yaml
File metadata and controls
30 lines (27 loc) · 803 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Configuration for Flux model transforms (export, optimizer, compile)
# Usage: python build_and_run_flux.py --config flux_transforms.yaml
# Export configuration
# The options below must be nested under `export`; at column 0 they would
# parse as unrelated top-level keys and leave `export` itself null.
# NOTE(review): `strict` presumably maps to torch.export's strict flag and
# `clone` to copying the model before export - confirm against the consumer.
export:
  clone: false
  strict: false
# TODO: Integrate these transforms into the optimizer
# Optimizer configuration - FP8/FP4 quantization and fusion
# optimizer:
#   quantize_fp8_from_graph:
#     stage: "pattern_matcher"
#   quantize_nvfp4_from_graph:
#     stage: "pattern_matcher"
#   fuse_fp8_gemms:
#     stage: "post_load_fusion"
#   fuse_fp4_gemms:
#     stage: "post_load_fusion"
#   fuse_fp8_linear:
#     stage: "post_load_fusion"
#     backend: "torch"
#   fuse_nvfp4_linear:
#     stage: "post_load_fusion"
#     backend: "trtllm"
# Compilation configuration
# The options below must be nested under `compile`; at column 0 they would
# parse as unrelated top-level keys and leave `compile` itself null.
compile:
  backend: "torch-opt"
  # Explicit null rather than an empty value.
  # NOTE(review): null presumably means "use the consumer's default batch
  # sizes" - confirm against the code that reads this key.
  cuda_graph_batch_sizes: null