Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
e4db511
perf(trt): GPU-fuse face-restoration paste_back (39ms->2.4ms) + bench…
wangzijian1010 May 30, 2026
421c054
perf(trt): cache static mask + GPU-fuse face-restoration preprocess (…
wangzijian1010 May 30, 2026
7c122bb
docs(readme): add Benchmark section for the GPU-optimized face-restor…
wangzijian1010 May 30, 2026
965e37b
docs(readme): reframe Benchmark as a pipeline-wide GPU-optimization e…
wangzijian1010 May 30, 2026
deb9f94
docs(readme): make Benchmark a flat per-algorithm table
wangzijian1010 May 30, 2026
68c2cd5
Update README.md
DefTruth May 31, 2026
e07415c
refactor: drop MNN/NCNN/TNN backends, keep ORT (reference) + TRT (pro…
wangzijian1010 May 31, 2026
fadba05
docs(readme): reframe around extreme-GPU-inference + FaceFusion flagship
wangzijian1010 May 31, 2026
24f8efe
feat(facefusion): out-of-box CLI runner + quickstart for the flagship…
wangzijian1010 May 31, 2026
0014e40
fix(facefusion): fail fast with clear errors instead of segfaulting
wangzijian1010 May 31, 2026
e590a7b
feat(facefusion): per-stage pipeline profiling + whole-pipeline bench…
wangzijian1010 Jun 5, 2026
eca2d23
fix(trt): stop unbounded GPU/host memory growth in the facefusion pip…
wangzijian1010 Jun 5, 2026
159eac6
feat(facefusion): in-memory Mat-in/Mat-out pipeline API + compute-onl…
wangzijian1010 Jun 5, 2026
83c7acc
perf(trt): GPU-fuse the face-swap paste-back (swap 16.9 -> 9.4ms)
wangzijian1010 Jun 5, 2026
7603793
perf(trt): GPU-fuse the face-detect (yoloface) preprocess (detect 9.6…
wangzijian1010 Jun 7, 2026
15aa2ca
tools: mixed-precision GFPGAN FP16 engine builder for TRT 10.1
wangzijian1010 Jun 7, 2026
8acca60
fix(trt): isolate ROI in yoloface letterbox padding (BORDER_ISOLATED)
wangzijian1010 Jun 7, 2026
1090755
perf/cleanup(trt): hoist per-frame constants out of the face-swap hot…
wangzijian1010 Jun 7, 2026
5885953
feat(facefusion): split pipeline into prepare_source() + process() (s…
wangzijian1010 Jun 7, 2026
d535d8a
perf(trt): fold restoration postprocess RGB->BGR + uint8->float into …
wangzijian1010 Jun 7, 2026
75ec4de
perf(trt): GPU-resident NPP warp for face-restoration preprocess (dev…
wangzijian1010 Jun 7, 2026
c842356
perf(facefusion): ship mixed-precision GFPGAN by default (clean FP16,…
wangzijian1010 Jun 7, 2026
6af7f38
perf(trt): DeviceFrame brick 1 — restoration uploads input frame once…
wangzijian1010 Jun 7, 2026
dc44469
perf(trt): DeviceFrame brick 2 — fold restoration's blend_frame into …
wangzijian1010 Jun 7, 2026
f5d50d9
perf(trt): DeviceFrame brick 3 — weld the swap->restoration seam (no …
wangzijian1010 Jun 7, 2026
dc2a4a5
perf(trt): DeviceFrame brick 4 — restoration crop stays on device (po…
wangzijian1010 Jun 7, 2026
ab10242
perf(trt): DeviceFrame brick 5 — swap input device-resident via one s…
wangzijian1010 Jun 7, 2026
23c4932
Update README.md
DefTruth Jun 8, 2026
a9495d6
docs: reframe README/quickstart around the device-resident pipeline +…
wangzijian1010 Jun 14, 2026
2019bd6
Update README.md
DefTruth Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ third_party
build/
lite.ai.toolkit.cmake
TestExamples

*.jpg
16 changes: 6 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,15 @@ add_definitions(-DSOURCE_PATH="${CMAKE_SOURCE_DIR}")

option(ENABLE_TEST "build test examples." OFF)
option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
option(ENABLE_MNN "enable MNN engine" OFF)
option(ENABLE_NCNN "enable NCNN engine" OFF)
option(ENABLE_TNN "enable TNN engine" OFF)
option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine (kept as numerical reference + test host)" ON)
option(ENABLE_TENSORRT "enable TensorRT engine (the maintained high-performance backend)" OFF)
option(ENABLE_ONNXRUNTIME_CUDA "enable ONNXRuntime engine with CUDA provider" OFF) # for future use
option(ENABLE_OPENCV_VIDEOIO "enable opencv videoio modules for detect_video apis" ON) # now, ON only

if ((NOT ENABLE_ONNXRUNTIME) AND (NOT ENABLE_MNN))
message(FATAL_ERROR "One of ONNXRuntime/MNN Backend must be enable!")
# As of >=0.3.2 the MNN/NCNN/TNN backends were dropped (frozen on tag `v0.2-all-backends`).
# ONNXRuntime is kept as the numerical-reference oracle and the only backend that can build tests.
if (NOT ENABLE_ONNXRUNTIME)
message(FATAL_ERROR "ONNXRuntime backend must be enabled (it hosts the test suite and numerical reference)!")
endif()

if ((NOT ENABLE_ONNXRUNTIME) AND ENABLE_TEST)
Expand Down Expand Up @@ -82,8 +81,5 @@ message(STATUS " Root Path: ${CMAKE_SOURCE_DIR}")
message(STATUS " OpenCV: ON Version: ${OpenCV_Version}")
message(STATUS " ONNXRUNTIME: ${ENABLE_ONNXRUNTIME} Version: ${OnnxRuntime_Version}")
message(STATUS " TENSORRT: ${ENABLE_TENSORRT} Version: ${TensorRT_Version}")
message(STATUS " MNN: ${ENABLE_MNN} Version: ${MNN_Version}")
message(STATUS " NCNN: ${ENABLE_NCNN} Version: ${NCNN_Version}")
message(STATUS " TNN: ${ENABLE_TNN} Version: ${TNN_Version}")
message(STATUS " INSTALL: ${CMAKE_INSTALL_PREFIX}")
message(STATUS "-------------------------- lite.ai.toolkit Configuration Summary --------------------------")
1,181 changes: 112 additions & 1,069 deletions README.md

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions build_facefusion_engines.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Build the 5 TensorRT engines the FaceFusion pipeline needs, from their ONNX files.
#
# Usage:
# bash ./build_facefusion_engines.sh <onnx_dir> <engine_dir>
#
# <onnx_dir> directory holding the 5 ONNX models (see docs/facefusion_quickstart.md)
# <engine_dir> where the .engine files are written (created if missing)
#
# Requires `trtexec` on PATH (ships with TensorRT 10.x). Override with TRTEXEC=...
# Strict mode: abort on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail

# Both positional arguments are mandatory; `${N:?msg}` aborts the script with the
# usage text when the argument is missing or empty.
ONNX_DIR="${1:?usage: $0 <onnx_dir> <engine_dir>}"
ENGINE_DIR="${2:?usage: $0 <onnx_dir> <engine_dir>}"
# Tool overrides: use TRTEXEC / PYTHON from the environment when set, otherwise
# fall back to the plain command names.
TRTEXEC="${TRTEXEC:-trtexec}"
PYTHON="${PYTHON:-python3}"
# Absolute directory of this script; used below to locate build_gfpgan_fp16_engine.py.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Create the output directory (and any missing parents); no error if it already exists.
mkdir -p "$ENGINE_DIR"

# build <onnx_name> <engine_name> [trtexec flags...]
#
# Converts $ONNX_DIR/<onnx_name> into $ENGINE_DIR/<engine_name> by invoking $TRTEXEC.
# Any arguments after the first two are forwarded to trtexec unchanged.
#   - exits the whole script with status 1 when the ONNX file is missing
#   - returns without doing anything when the engine file already exists
build() {
    local src="$ONNX_DIR/$1"
    local dst="$ENGINE_DIR/$2"
    shift 2   # what remains in "$@" are the extra trtexec flags

    [[ -f "$src" ]] || {
        echo "[build_facefusion_engines] MISSING onnx: $src" >&2
        exit 1
    }

    if [[ -f "$dst" ]]; then
        echo "[build_facefusion_engines] skip (exists): $dst"
        return
    fi

    echo "[build_facefusion_engines] $src -> $dst ($*)"
    "$TRTEXEC" --onnx="$src" --saveEngine="$dst" "$@"
}

# These four models all follow the same naming scheme (<stem>.onnx -> <stem>_fp16.engine)
# and are all built with trtexec's --fp16 flag, in this order.
for model in yoloface_8n 2dfan4 arcface_w600k_r50 inswapper_128; do
    build "${model}.onnx" "${model}_fp16.engine" --fp16
done

# GFPGAN: a naive --fp16 engine blows up its StyleGAN modulated convs (grey-block / a grey
# halo around the pasted-back face). The fix is mixed precision — FP16 everywhere except the
# style_conv/to_rgb layers, which stay FP32 (build_gfpgan_fp16_engine.py). That is numerically
# near-identical to the FP32 engine (PSNR ~58 dB) while cutting the restoration stage by ~3 ms.
# Needs the TensorRT 10.x python wheel on $PYTHON; set GFPGAN_FP32=1 to fall back to plain FP32.
# GFPGAN engine selection:
#   GFPGAN_FP32=1          -> plain FP32 engine via trtexec (gfpgan_1.4_fp32.engine)
#   mixed engine on disk   -> nothing to do
#   otherwise              -> mixed-precision engine via build_gfpgan_fp16_engine.py
GFPGAN_ONNX="$ONNX_DIR/gfpgan_1.4.onnx"
GFPGAN_ENGINE="$ENGINE_DIR/gfpgan_1.4_mixed.engine"
GFPGAN_BUILDER="$HERE/build_gfpgan_fp16_engine.py"
if [[ "${GFPGAN_FP32:-0}" == "1" ]]; then
    # No precision flag is passed, so trtexec builds with its default precision.
    build gfpgan_1.4.onnx gfpgan_1.4_fp32.engine
elif [[ -f "$GFPGAN_ENGINE" ]]; then
    echo "[build_facefusion_engines] skip (exists): $GFPGAN_ENGINE"
else
    # Fail fast with the same style of message the trtexec path uses, instead of
    # letting the Python helper die with a traceback on a missing input.
    if [[ ! -f "$GFPGAN_ONNX" ]]; then
        echo "[build_facefusion_engines] MISSING onnx: $GFPGAN_ONNX" >&2; exit 1
    fi
    if [[ ! -f "$GFPGAN_BUILDER" ]]; then
        echo "[build_facefusion_engines] MISSING builder script: $GFPGAN_BUILDER" >&2; exit 1
    fi
    echo "[build_facefusion_engines] gfpgan_1.4.onnx -> $GFPGAN_ENGINE (mixed fp16, style layers fp32)"
    "$PYTHON" "$GFPGAN_BUILDER" "$GFPGAN_ONNX" "$GFPGAN_ENGINE"
fi

echo "[build_facefusion_engines] done -> $ENGINE_DIR"
84 changes: 84 additions & 0 deletions build_gfpgan_fp16_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python
# Build a *mixed-precision* TensorRT engine for GFPGAN that the lite.ai.toolkit C++
# (TensorRT 10.1) can load — fast FP16 everywhere EXCEPT the StyleGAN modulated convs,
# which are kept in FP32.
#
# Why this exists:
# A naive `trtexec --fp16` GFPGAN engine produces grey-block artifacts: the StyleGAN
# "modulated conv" demodulation (sum-of-squares -> rsqrt) overflows/underflows in FP16.
# The clean fix is to keep just those layers (style_conv* / to_rgb*) in FP32 and run the
# rest in FP16. On TensorRT 10.1 the "strong typing via Cast nodes in the ONNX" route
# crashes (matchTypeSpec); the route that works is weak FP16 + OBEY_PRECISION_CONSTRAINTS
# with per-layer FP32 precision set through the builder API (this script).
#
# Result on RTX 4090 / TRT 10.1: restoration infer 10.8 -> 8.0 ms, output numerically clean
# (no grey blocks); facefusion pipeline 36.6 -> 33.1 ms (27 -> 30 FPS).
#
# Requirements: the TensorRT 10.1 *python* wheel (ships in the TRT tarball under python/),
# e.g. python -m venv env && env/bin/pip install /usr/local/tensorrt/python/tensorrt-10.1.0-cp312-*.whl
#
# Usage:
# LD_LIBRARY_PATH=/usr/local/tensorrt/lib:/usr/local/cuda/lib64 \
# python build_gfpgan_fp16_engine.py <gfpgan.onnx> <out.engine>
#
import sys
import os
import tensorrt as trt

# Substring match on layer names; these are the StyleGAN modulated convs that must stay FP32.
KEEP_FP32 = ("style_conv", "to_rgb")
# Output dtypes that count as floating point: a layer is only pinned to FP32 when
# every one of its outputs has one of these dtypes (see all_float in main()).
FLOAT_TYPES = (trt.float32, trt.float16)


def main():
    """Build a mixed-precision TensorRT engine from an ONNX file and write it to disk.

    Command line: ``<script> <gfpgan.onnx> <out.engine>``.

    The network is built with the FP16 and OBEY_PRECISION_CONSTRAINTS builder flags.
    Every non-Constant layer whose name contains one of the KEEP_FP32 substrings and
    whose outputs are all float typed is pinned to FP32 (layer precision and every
    output type).

    Exits with status 1 on bad usage, on an ONNX parse failure, or when the builder
    returns no serialized engine. Diagnostics are written to stderr.
    """
    if len(sys.argv) < 3:
        # The file header consists of '#' comments, not a module docstring, so
        # __doc__ is None here; print an explicit usage line instead of "None".
        prog = os.path.basename(sys.argv[0]) if sys.argv and sys.argv[0] else "build_gfpgan_fp16_engine.py"
        print(f"usage: python {prog} <gfpgan.onnx> <out.engine>", file=sys.stderr)
        sys.exit(1)
    onnx_path, engine_path = sys.argv[1], sys.argv[2]

    log = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(log)
    network = builder.create_network(0)
    parser = trt.OnnxParser(network, log)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            # Report every parser error before giving up.
            for i in range(parser.num_errors):
                print(parser.get_error(i), file=sys.stderr)
            sys.exit(1)

    cfg = builder.create_builder_config()
    cfg.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)  # 4 GiB workspace cap
    cfg.set_flag(trt.BuilderFlag.FP16)
    # Make the builder honour the per-layer FP32 pins set below.
    cfg.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)

    def all_float(layer):
        """Return True when the layer has at least one output and all outputs are float typed."""
        return layer.num_outputs > 0 and all(
            layer.get_output(j).dtype in FLOAT_TYPES for j in range(layer.num_outputs)
        )

    pinned = 0
    for i in range(network.num_layers):
        layer = network.get_layer(i)
        if not any(k in layer.name for k in KEEP_FP32):
            continue
        # Only float compute layers; skip Int64/shape Constants (can't be FP32-typed).
        if layer.type == trt.LayerType.CONSTANT or not all_float(layer):
            continue
        layer.precision = trt.float32
        for j in range(layer.num_outputs):
            layer.set_output_type(j, trt.float32)
        pinned += 1
    print(f"network layers={network.num_layers} pinned to fp32={pinned}", flush=True)

    print("building serialized engine (this is slow on the first build)...", flush=True)
    serialized = builder.build_serialized_network(network, cfg)
    if serialized is None:
        print("BUILD FAILED", file=sys.stderr)
        sys.exit(1)
    with open(engine_path, "wb") as f:
        f.write(serialized)
    print(f"OK wrote {engine_path} ({os.path.getsize(engine_path) / 1e6:.1f} MB)")


if __name__ == "__main__":
    main()
38 changes: 0 additions & 38 deletions cmake/MNN.cmake

This file was deleted.

27 changes: 0 additions & 27 deletions cmake/TNN.cmake

This file was deleted.

34 changes: 2 additions & 32 deletions cmake/lite.ai.toolkit.cmake.in
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
CMAKE_MINIMUM_REQUIRED(VERSION 3.8)

set(ENABLE_ONNXRUNTIME @ENABLE_ONNXRUNTIME@)
set(ENABLE_MNN @ENABLE_MNN@)
set(ENABLE_NCNN @ENABLE_NCNN@)
set(ENABLE_TNN @ENABLE_TNN@)
set(ENABLE_TENSORRT @ENABLE_TENSORRT@)
set(CUDA_DIR @CUDA_DIR@)
set(TensorRT_DIR @TensorRT_DIR@)
Expand All @@ -13,8 +10,8 @@ if (NOT (UNIX AND NOT APPLE))
message(FATAL_ERROR "lite.ai.toolkit>=0.2 not support for windows/mac now!")
endif()

if ((NOT ENABLE_ONNXRUNTIME) AND (NOT ENABLE_MNN))
message(FATAL_ERROR "One of ONNXRuntime/MNN Backend must be enable!")
if (NOT ENABLE_ONNXRUNTIME)
message(FATAL_ERROR "ONNXRuntime backend must be enabled!")
endif()

# lite.ai.toolkit
Expand Down Expand Up @@ -46,33 +43,6 @@ if (ENABLE_ONNXRUNTIME)
list(APPEND Lite_AI_LIBS onnxruntime)
endif()

# MNN
if (ENABLE_MNN)
include_directories(${THIRD_PARTY_PATH}/MNN/include)
link_directories(${THIRD_PARTY_PATH}/MNN/lib)
list(APPEND Lite_AI_INCLUDE_DIRS ${THIRD_PARTY_PATH}/MNN/include)
list(APPEND Lite_AI_LIBS_DIRS ${THIRD_PARTY_PATH}/MNN/lib)
list(APPEND Lite_AI_LIBS MNN)
endif()

# TNN
if (ENABLE_TNN)
include_directories(${THIRD_PARTY_PATH}/TNN/include)
link_directories(${THIRD_PARTY_PATH}/TNN/lib)
list(APPEND Lite_AI_INCLUDE_DIRS ${THIRD_PARTY_PATH}/TNN/include)
list(APPEND Lite_AI_LIBS_DIRS ${THIRD_PARTY_PATH}/TNN/lib)
list(APPEND Lite_AI_LIBS TNN)
endif()

# ncnn
if (ENABLE_NCNN)
include_directories(${THIRD_PARTY_PATH}/ncnn/include)
link_directories(${THIRD_PARTY_PATH}/ncnn/lib)
list(APPEND Lite_AI_INCLUDE_DIRS ${THIRD_PARTY_PATH}/ncnn/include)
list(APPEND Lite_AI_LIBS_DIRS ${THIRD_PARTY_PATH}/ncnn/lib)
list(APPEND Lite_AI_LIBS ncnn)
endif()

# tensorrt
if (ENABLE_TENSORRT)
include_directories(${TensorRT_DIR}/include)
Expand Down
27 changes: 0 additions & 27 deletions cmake/ncnn.cmake

This file was deleted.

21 changes: 2 additions & 19 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -53,28 +53,11 @@ function(add_lite_ai_toolkit_shared_library version soversion)
include(cmake/tensorrt.cmake)
set(LITE_SRCS ${LITE_SRCS} ${TRT_SRCS})
set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cuda cudart nvinfer nvonnxparser
nvinfer_plugin ddim_scheduler_cpp)
nvinfer_plugin ddim_scheduler_cpp
nppc nppig nppidei) # NPP: GPU warp/resize
link_directories(${CMAKE_SOURCE_DIR}/lite/bin)
endif ()

if (ENABLE_MNN)
include(cmake/MNN.cmake)
set(LITE_SRCS ${LITE_SRCS} ${MNN_SRCS})
set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} MNN)
endif ()

if (ENABLE_NCNN)
include(cmake/ncnn.cmake)
set(LITE_SRCS ${LITE_SRCS} ${NCNN_SRCS})
set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} ncnn)
endif ()

if (ENABLE_TNN)
include(cmake/TNN.cmake)
set(LITE_SRCS ${LITE_SRCS} ${TNN_SRCS})
set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} TNN)
endif ()

# 4. shared library
add_library(lite.ai.toolkit SHARED ${LITE_SRCS})
target_link_libraries(lite.ai.toolkit ${LITE_DEPENDENCIES})
Expand Down
Loading