xlite-dev · wangzijian1010 · May 30, 2026 · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/.gitignore b/.gitignore
@@ -13,4 +13,4 @@ third_party
 build/
 lite.ai.toolkit.cmake
 TestExamples
-
+*.jpg
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -31,16 +31,15 @@ add_definitions(-DSOURCE_PATH="${CMAKE_SOURCE_DIR}")
 
 option(ENABLE_TEST "build test examples." OFF)
 option(ENABLE_DEBUG_STRING "enable DEBUG string or not" ON)
-option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine" ON)
-option(ENABLE_TENSORRT "enable TensorRT engine" OFF)
-option(ENABLE_MNN "enable MNN engine" OFF)
-option(ENABLE_NCNN "enable NCNN engine" OFF) 
-option(ENABLE_TNN "enable TNN engine" OFF)
+option(ENABLE_ONNXRUNTIME "enable ONNXRuntime engine (kept as numerical reference + test host)" ON)
+option(ENABLE_TENSORRT "enable TensorRT engine (the maintained high-performance backend)" OFF)
 option(ENABLE_ONNXRUNTIME_CUDA "enable ONNXRuntime engine with CUDA provider" OFF) # for future use
 option(ENABLE_OPENCV_VIDEOIO "enable opencv videoio modules for detect_video apis" ON) # now, ON only
 
-if ((NOT ENABLE_ONNXRUNTIME) AND (NOT ENABLE_MNN))
-    message(FATAL_ERROR "One of ONNXRuntime/MNN Backend must be enable!")
+# Since version 0.3.2 the MNN/NCNN/TNN backends have been dropped (their last state is frozen on tag `v0.2-all-backends`).
+# ONNXRuntime is kept as the numerical-reference oracle and is the only backend that can build the tests.
+if (NOT ENABLE_ONNXRUNTIME)
+    message(FATAL_ERROR "ONNXRuntime backend must be enabled (it hosts the test suite and numerical reference)!")
 endif()
 
 if ((NOT ENABLE_ONNXRUNTIME) AND ENABLE_TEST)
@@ -82,8 +81,5 @@ message(STATUS "      Root Path: ${CMAKE_SOURCE_DIR}")
 message(STATUS "         OpenCV: ON Version: ${OpenCV_Version}")
 message(STATUS "    ONNXRUNTIME: ${ENABLE_ONNXRUNTIME} Version: ${OnnxRuntime_Version}")
 message(STATUS "       TENSORRT: ${ENABLE_TENSORRT} Version: ${TensorRT_Version}")
-message(STATUS "            MNN: ${ENABLE_MNN} Version: ${MNN_Version}")
-message(STATUS "           NCNN: ${ENABLE_NCNN} Version: ${NCNN_Version}")
-message(STATUS "            TNN: ${ENABLE_TNN} Version: ${TNN_Version}")
 message(STATUS "        INSTALL:  ${CMAKE_INSTALL_PREFIX}")
 message(STATUS "-------------------------- lite.ai.toolkit Configuration Summary --------------------------")
diff --git a/README.md b/README.md
diff --git a/build_facefusion_engines.sh b/build_facefusion_engines.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Build the 5 TensorRT engines the FaceFusion pipeline needs, from their ONNX files.
+#
+# Usage:
+#   bash ./build_facefusion_engines.sh <onnx_dir> <engine_dir>
+#
+# <onnx_dir>   directory holding the 5 ONNX models (see docs/facefusion_quickstart.md)
+# <engine_dir> where the .engine files are written (created if missing)
+#
+# Requires `trtexec` on PATH (ships with TensorRT 10.x). Override with TRTEXEC=...
+# The GFPGAN mixed-precision step runs `python3`; override with PYTHON=...
+set -euo pipefail
+
+ONNX_DIR="${1:?usage: $0 <onnx_dir> <engine_dir>}"
+ENGINE_DIR="${2:?usage: $0 <onnx_dir> <engine_dir>}"
+TRTEXEC="${TRTEXEC:-trtexec}"
+PYTHON="${PYTHON:-python3}"
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+mkdir -p "$ENGINE_DIR"
+
+# build <onnx_name> <engine_name> [trtexec args...]
+# Converts $ONNX_DIR/<onnx_name> into $ENGINE_DIR/<engine_name> with trtexec. Exits with
+# status 1 when the ONNX file is missing; returns without rebuilding when the engine exists.
+build() {
+  local onnx="$ONNX_DIR/$1" engine="$ENGINE_DIR/$2"; shift 2
+  if [[ ! -f "$onnx" ]]; then
+    echo "[build_facefusion_engines] MISSING onnx: $onnx" >&2; exit 1
+  fi
+  if [[ -f "$engine" ]]; then
+    echo "[build_facefusion_engines] skip (exists): $engine"; return
+  fi
+  echo "[build_facefusion_engines] $onnx -> $engine  ($*)"
+  "$TRTEXEC" --onnx="$onnx" --saveEngine="$engine" "$@"
+}
+
+build yoloface_8n.onnx          yoloface_8n_fp16.engine          --fp16
+build 2dfan4.onnx               2dfan4_fp16.engine               --fp16
+build arcface_w600k_r50.onnx    arcface_w600k_r50_fp16.engine    --fp16
+build inswapper_128.onnx        inswapper_128_fp16.engine        --fp16
+
+# GFPGAN: a naive --fp16 engine blows up its StyleGAN modulated convs (grey-block / a grey
+# halo around the pasted-back face). The fix is mixed precision — FP16 everywhere except the
+# style_conv/to_rgb layers, which stay FP32 (build_gfpgan_fp16_engine.py). That is numerically
+# identical to the FP32 engine (PSNR ~58 dB) while cutting the restoration stage ~3 ms.
+# Needs the TensorRT 10.x python wheel on $PYTHON; set GFPGAN_FP32=1 to fall back to plain FP32.
+GFPGAN_ONNX="$ONNX_DIR/gfpgan_1.4.onnx"
+GFPGAN_ENGINE="$ENGINE_DIR/gfpgan_1.4_mixed.engine"
+if [[ "${GFPGAN_FP32:-0}" == "1" ]]; then
+  build gfpgan_1.4.onnx         gfpgan_1.4_fp32.engine
+elif [[ -f "$GFPGAN_ENGINE" ]]; then
+  echo "[build_facefusion_engines] skip (exists): $GFPGAN_ENGINE"
+else
+  # Same missing-input check as build(), so a bad <onnx_dir> fails with a clear message
+  # instead of a Python traceback.
+  if [[ ! -f "$GFPGAN_ONNX" ]]; then
+    echo "[build_facefusion_engines] MISSING onnx: $GFPGAN_ONNX" >&2; exit 1
+  fi
+  echo "[build_facefusion_engines] gfpgan_1.4.onnx -> $GFPGAN_ENGINE  (mixed fp16, style layers fp32)"
+  "$PYTHON" "$HERE/build_gfpgan_fp16_engine.py" "$GFPGAN_ONNX" "$GFPGAN_ENGINE"
+fi
+
+echo "[build_facefusion_engines] done -> $ENGINE_DIR"
diff --git a/build_gfpgan_fp16_engine.py b/build_gfpgan_fp16_engine.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+# Build a *mixed-precision* TensorRT engine for GFPGAN that the lite.ai.toolkit C++
+# (TensorRT 10.1) can load — fast FP16 everywhere EXCEPT the StyleGAN modulated convs,
+# which are kept in FP32.
+#
+# Why this exists:
+#   A naive `trtexec --fp16` GFPGAN engine produces grey-block artifacts: the StyleGAN
+#   "modulated conv" demodulation (sum-of-squares -> rsqrt) overflows/underflows in FP16.
+#   The clean fix is to keep just those layers (style_conv* / to_rgb*) in FP32 and run the
+#   rest in FP16. On TensorRT 10.1 the "strong typing via Cast nodes in the ONNX" route
+#   crashes (matchTypeSpec); the route that works is weak FP16 + OBEY_PRECISION_CONSTRAINTS
+#   with per-layer FP32 precision set through the builder API (this script).
+#
+# Result on RTX 4090 / TRT 10.1: restoration infer 10.8 -> 8.0 ms, output numerically clean
+# (no grey blocks); facefusion pipeline 36.6 -> 33.1 ms (27 -> 30 FPS).
+#
+# Requirements: the TensorRT 10.1 *python* wheel (ships in the TRT tarball under python/),
+# e.g.  python -m venv env && env/bin/pip install /usr/local/tensorrt/python/tensorrt-10.1.0-cp312-*.whl
+#
+# Usage:
+#   LD_LIBRARY_PATH=/usr/local/tensorrt/lib:/usr/local/cuda/lib64 \
+#     python build_gfpgan_fp16_engine.py <gfpgan.onnx> <out.engine>
+#
+import sys
+import os
+import tensorrt as trt
+
+# Substring match on layer names; these are the StyleGAN modulated convs that must stay FP32.
+KEEP_FP32 = ("style_conv", "to_rgb")
+FLOAT_TYPES = (trt.float32, trt.float16)
+# Printed when the script is called with too few arguments. The header above is made of
+# `#` comments, not a module docstring, so `__doc__` is None and cannot serve as usage text.
+USAGE = "usage: python build_gfpgan_fp16_engine.py <gfpgan.onnx> <out.engine>"
+
+
+def _all_float(layer):
+    """Return True when the layer has at least one output and every output is FP32/FP16."""
+    return layer.num_outputs > 0 and all(
+        layer.get_output(j).dtype in FLOAT_TYPES for j in range(layer.num_outputs)
+    )
+
+
+def _pin_fp32(network):
+    """Set FP32 precision and FP32 output types on float layers named like KEEP_FP32.
+
+    Returns the number of layers that were pinned.
+    """
+    pinned = 0
+    for i in range(network.num_layers):
+        layer = network.get_layer(i)
+        if not any(k in layer.name for k in KEEP_FP32):
+            continue
+        # Only float compute layers; skip Int64/shape Constants (can't be FP32-typed).
+        if layer.type == trt.LayerType.CONSTANT or not _all_float(layer):
+            continue
+        layer.precision = trt.float32
+        for j in range(layer.num_outputs):
+            layer.set_output_type(j, trt.float32)
+        pinned += 1
+    return pinned
+
+
+def main():
+    """Parse <gfpgan.onnx>, pin the style layers to FP32 and write <out.engine>.
+
+    Exits with status 1 on bad arguments, ONNX parse errors or a failed engine build.
+    """
+    if len(sys.argv) < 3:
+        print(USAGE, file=sys.stderr)
+        sys.exit(1)
+    onnx_path, engine_path = sys.argv[1], sys.argv[2]
+
+    log = trt.Logger(trt.Logger.WARNING)
+    builder = trt.Builder(log)
+    network = builder.create_network(0)
+    parser = trt.OnnxParser(network, log)
+    with open(onnx_path, "rb") as f:
+        if not parser.parse(f.read()):
+            for i in range(parser.num_errors):
+                print(parser.get_error(i), file=sys.stderr)
+            sys.exit(1)
+
+    cfg = builder.create_builder_config()
+    cfg.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)
+    cfg.set_flag(trt.BuilderFlag.FP16)
+    cfg.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)
+
+    pinned = _pin_fp32(network)
+    print(f"network layers={network.num_layers}  pinned to fp32={pinned}", flush=True)
+    if pinned == 0:
+        # No layer name matched KEEP_FP32, so nothing distinguishes this from a plain FP16 build.
+        print(
+            f"WARNING: no layer name matched {KEEP_FP32}; the engine will be plain FP16",
+            file=sys.stderr,
+            flush=True,
+        )
+
+    print("building serialized engine (this is slow on the first build)...", flush=True)
+    serialized = builder.build_serialized_network(network, cfg)
+    if serialized is None:
+        print("BUILD FAILED", file=sys.stderr)
+        sys.exit(1)
+    with open(engine_path, "wb") as f:
+        f.write(serialized)
+    print(f"OK wrote {engine_path} ({os.path.getsize(engine_path) / 1e6:.1f} MB)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cmake/MNN.cmake b/cmake/MNN.cmake
diff --git a/cmake/TNN.cmake b/cmake/TNN.cmake
diff --git a/cmake/lite.ai.toolkit.cmake.in b/cmake/lite.ai.toolkit.cmake.in
@@ -1,9 +1,6 @@
 CMAKE_MINIMUM_REQUIRED(VERSION 3.8)
 
 set(ENABLE_ONNXRUNTIME @ENABLE_ONNXRUNTIME@)
-set(ENABLE_MNN @ENABLE_MNN@)
-set(ENABLE_NCNN @ENABLE_NCNN@)
-set(ENABLE_TNN @ENABLE_TNN@)
 set(ENABLE_TENSORRT @ENABLE_TENSORRT@)
 set(CUDA_DIR @CUDA_DIR@)
 set(TensorRT_DIR @TensorRT_DIR@)
@@ -13,8 +10,8 @@ if (NOT (UNIX AND NOT APPLE))
   message(FATAL_ERROR "lite.ai.toolkit>=0.2 not support for windows/mac now!")
 endif()
 
-if ((NOT ENABLE_ONNXRUNTIME) AND (NOT ENABLE_MNN))
-    message(FATAL_ERROR "One of ONNXRuntime/MNN Backend must be enable!")
+if (NOT ENABLE_ONNXRUNTIME)
+    message(FATAL_ERROR "ONNXRuntime backend must be enabled!")
 endif()
 
 # lite.ai.toolkit 
@@ -46,33 +43,6 @@ if (ENABLE_ONNXRUNTIME)
   list(APPEND Lite_AI_LIBS onnxruntime)
 endif()
 
-# MNN
-if (ENABLE_MNN)
-  include_directories(${THIRD_PARTY_PATH}/MNN/include)
-  link_directories(${THIRD_PARTY_PATH}/MNN/lib)
-  list(APPEND Lite_AI_INCLUDE_DIRS ${THIRD_PARTY_PATH}/MNN/include)
-  list(APPEND Lite_AI_LIBS_DIRS ${THIRD_PARTY_PATH}/MNN/lib)
-  list(APPEND Lite_AI_LIBS MNN)
-endif()
-
-# TNN
-if (ENABLE_TNN)
-  include_directories(${THIRD_PARTY_PATH}/TNN/include)
-  link_directories(${THIRD_PARTY_PATH}/TNN/lib)
-  list(APPEND Lite_AI_INCLUDE_DIRS ${THIRD_PARTY_PATH}/TNN/include)
-  list(APPEND Lite_AI_LIBS_DIRS ${THIRD_PARTY_PATH}/TNN/lib)
-  list(APPEND Lite_AI_LIBS TNN)
-endif()
-
-# ncnn
-if (ENABLE_NCNN)
-  include_directories(${THIRD_PARTY_PATH}/ncnn/include)
-  link_directories(${THIRD_PARTY_PATH}/ncnn/lib)
-  list(APPEND Lite_AI_INCLUDE_DIRS ${THIRD_PARTY_PATH}/ncnn/include)
-  list(APPEND Lite_AI_LIBS_DIRS ${THIRD_PARTY_PATH}/ncnn/lib)
-  list(APPEND Lite_AI_LIBS ncnn)
-endif()
-
 # tensorrt
 if (ENABLE_TENSORRT)
   include_directories(${TensorRT_DIR}/include)

diff --git a/cmake/ncnn.cmake b/cmake/ncnn.cmake
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
@@ -53,28 +53,11 @@ function(add_lite_ai_toolkit_shared_library version soversion)
         include(cmake/tensorrt.cmake)
         set(LITE_SRCS ${LITE_SRCS} ${TRT_SRCS})
         set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} cuda cudart nvinfer nvonnxparser
-                                                   nvinfer_plugin ddim_scheduler_cpp)
+                                                   nvinfer_plugin ddim_scheduler_cpp
+                                                   nppc nppig nppidei)  # NPP: GPU warp/resize
         link_directories(${CMAKE_SOURCE_DIR}/lite/bin)
     endif ()
 
-    if (ENABLE_MNN)
-        include(cmake/MNN.cmake)
-        set(LITE_SRCS ${LITE_SRCS} ${MNN_SRCS})
-        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} MNN)
-    endif ()
-
-    if (ENABLE_NCNN)
-        include(cmake/ncnn.cmake)
-        set(LITE_SRCS ${LITE_SRCS} ${NCNN_SRCS})
-        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} ncnn)
-    endif ()
-
-    if (ENABLE_TNN)
-        include(cmake/TNN.cmake)
-        set(LITE_SRCS ${LITE_SRCS} ${TNN_SRCS})
-        set(LITE_DEPENDENCIES ${LITE_DEPENDENCIES} TNN)
-    endif ()
-
     # 4. shared library
     add_library(lite.ai.toolkit SHARED ${LITE_SRCS})
     target_link_libraries(lite.ai.toolkit ${LITE_DEPENDENCIES})
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,4 +13,4 @@ third_party @@
     build/
     lite.ai.toolkit.cmake
     TestExamples
+    *.jpg