[TRT] Support YOLO 11 (#468)

wangzijian1010 · web-flow · commit e53c332eb862 · 2025-12-12T21:02:52.000+08:00
diff --git a/examples/lite/CMakeLists.txt b/examples/lite/CMakeLists.txt
@@ -110,4 +110,5 @@ add_lite_executable(lite_face_swap cv)
 add_lite_executable(lite_face_restoration cv)
 add_lite_executable(lite_facefusion_pipeline cv)
 add_lite_executable(lite_yolov8 cv)
+add_lite_executable(lite_yolov11 cv)
 add_lite_executable(lite_sd_pipeline sd)
diff --git a/examples/lite/cv/test_lite_yolov11.cpp b/examples/lite/cv/test_lite_yolov11.cpp
@@ -0,0 +1,43 @@
+//
+// Created by wangzijian on 24-7-8.
+//
+
+#include "lite/lite.h"
+
+
+
+/**
+ * @brief Runs the TensorRT YOLO11 detector on one test image and writes the annotated result.
+ *
+ * The body is compiled only when ENABLE_TENSORRT is defined; otherwise this function is a no-op.
+ * All paths are hard-coded absolute paths and must be adapted to the local checkout.
+ */
+static void test_tensorrt()
+{
+#ifdef ENABLE_TENSORRT
+    std::string engine_path = "/root/autodl-tmp/lite.ai.toolkit/examples/hub/onnx/cv/yolo11_fp32.engine";
+    std::string test_img_path = "/root/autodl-tmp/lite.ai.toolkit/examples/lite/resources/test_lite_yolov5_2.jpg";
+    // The output name identifies the YOLO11 run.
+    std::string save_img_path = "/root/autodl-tmp/lite.ai.toolkit/examples/test_lite_yolov11_trt_1.jpg";
+
+    cv::Mat test_image = cv::imread(test_img_path);
+    if (test_image.empty())
+    {
+        // Nothing can be drawn or written for an unreadable image, so stop before loading the engine.
+        std::cerr << "failed to read test image: " << test_img_path << std::endl;
+        return;
+    }
+
+    // Automatic storage: the detector is destroyed on every exit path.
+    lite::trt::cv::detection::YOLOV11 yolov11(engine_path);
+
+    std::vector<lite::types::Boxf> detected_boxes;
+
+    // score threshold 0.5, IoU threshold 0.4; topk and NMS type use the defaults.
+    yolov11.detect(test_image, detected_boxes, 0.5f, 0.4f);
+
+    std::cout << "trt yolov11 detect done!" << std::endl;
+    lite::utils::draw_boxes_inplace(test_image, detected_boxes);
+    cv::imwrite(save_img_path, test_image);
+#endif
+}
+
+/// Runs every backend test available in this example; only the TensorRT test exists here.
+static void test_lite() {
+    test_tensorrt();
+}
+
+
+
+/// Program entry point: the command-line arguments are ignored and the example tests are run once.
+int main(__unused int argc, __unused char *argv[]) {
+    test_lite();
+
+    return 0;
+}
diff --git a/lite/models.h b/lite/models.h
@@ -135,6 +135,7 @@
 #include "lite/trt/core/trt_core.h"
 #include "lite/trt/cv/trt_yolofacev8.h"
 #include "lite/trt/cv/trt_yolov5.h"
+#include "lite/trt/cv/trt_yolov11.h"
 #include "lite/trt/cv/trt_yolox.h"
 #include "lite/trt/cv/trt_yolov8.h"
 #include "lite/trt/cv/trt_yolov6.h"
@@ -764,6 +765,7 @@ namespace lite{
             typedef trtcv::TRTYoloFaceV8 _TRT_YOLOFaceNet;
             typedef trtcv::TRTYoloV5 _TRT_YOLOv5;
             typedef trtcv::TRTYoloV8 _TRT_YOLOv8;
+            typedef trtcv::TRTYOLOV11 _TRT_YOLOV11;
             typedef trtcv::TRTYoloX _TRT_YoloX;
             typedef trtcv::TRTYoloV6 _TRT_YOLOv6;
             typedef trtcv::TRTYOLO5Face _TRT_YOLO5Face;
@@ -789,6 +791,7 @@ namespace lite{
                 typedef _TRT_YOLOv8 YOLOV8;
                 typedef _TRT_YoloX YoloX;
                 typedef _TRT_YOLOv6 YOLOV6;
+                typedef _TRT_YOLOV11 YOLOV11;
             }
             namespace face
             {
diff --git a/lite/trt/core/trt_core.h b/lite/trt/core/trt_core.h
@@ -16,6 +16,7 @@ namespace trtcv{
     class LITE_EXPORTS TRTYoloV8;     // [4] * reference: https://github.com/ultralytics/ultralytics/tree/main
     class LITE_EXPORTS TRTYoloV6;     // [5] * reference: https://github.com/meituan/YOLOv6
     class LITE_EXPORTS TRTYOLO5Face;     // [6] * reference: https://github.com/deepcam-cn/yolov5-face
+    class LITE_EXPORTS TRTYOLOV11;
 }
 
 namespace trtcv{
diff --git a/lite/trt/cv/trt_yolov11.cpp b/lite/trt/cv/trt_yolov11.cpp
@@ -0,0 +1,199 @@
+//
+// Created by wangzijian.
+//
+
+#include "trt_yolov11.h"
+using trtcv::TRTYOLOV11;
+
+/**
+ * @brief Dispatches to the non-maximum-suppression routine selected by @p nms_type.
+ *
+ * @param input         Candidate boxes, handed to the selected routine by non-const reference.
+ * @param output        Receives the boxes kept by the selected routine.
+ * @param iou_threshold IoU threshold forwarded to the routine.
+ * @param topk          Forwarded to the routine.
+ * @param nms_type      NMS::BLEND or NMS::OFFSET; every other value selects hard NMS.
+ */
+void TRTYOLOV11::nms(std::vector<types::Boxf> &input, std::vector<types::Boxf> &output,
+                    float iou_threshold, unsigned int topk, unsigned int nms_type)
+{
+    switch (nms_type)
+    {
+        case NMS::BLEND:
+            lite::utils::blending_nms(input, output, iou_threshold, topk);
+            break;
+        case NMS::OFFSET:
+            lite::utils::offset_nms(input, output, iou_threshold, topk);
+            break;
+        default: // NMS::HARD and any unrecognised value
+            lite::utils::hard_nms(input, output, iou_threshold, topk);
+            break;
+    }
+}
+
+/**
+ * @brief Decodes the raw network output into candidate boxes in original-image coordinates.
+ *
+ * The buffer is indexed channel-major as [4 + num_classes][num_anchors]: rows 0..3 hold
+ * cx, cy, w, h in letterboxed network-input pixels, and each remaining row holds one class score
+ * per anchor. For every anchor the best-scoring class is taken; anchors below
+ * @p score_threshold are dropped. The letterbox is undone by subtracting the padding and
+ * dividing by @p scale.
+ *
+ * @param bbox_collection Cleared, then filled with at most max_nms boxes.
+ * @param output          Host pointer to the raw prediction tensor.
+ * @param score_threshold Minimum best-class score for an anchor to be kept.
+ * @param scale           Resize ratio applied when letterboxing the input.
+ * @param pad_w           Left padding (pixels) added by the letterbox.
+ * @param pad_h           Top padding (pixels) added by the letterbox.
+ *
+ * @note x1/y1 are clamped at 0; x2/y2 are not clamped here because the image size is not
+ *       available in this function.
+ */
+void TRTYOLOV11::generate_bboxes(std::vector<types::Boxf> &bbox_collection, float* output, float score_threshold,
+                                float scale, float pad_w, float pad_h) {
+    bbox_collection.clear();
+
+    const auto &pred_dims = output_node_dims[0]; // e.g. [1, 84, 8400]
+    // With 4 or fewer channels there are no class scores, and the unsigned subtraction below would wrap.
+    if (output == nullptr || pred_dims[1] <= 4 || pred_dims[2] <= 0) return;
+
+    const unsigned int num_anchors = pred_dims[2];
+    const unsigned int num_classes = pred_dims[1] - 4;
+    // Number of entries in the built-in label table; engines may expose more classes than that.
+    const unsigned int num_names = sizeof(class_names) / sizeof(class_names[0]);
+
+    for (unsigned int i = 0; i < num_anchors; ++i) {
+        // Find the highest class score for this anchor.
+        float max_cls_conf = -1.f;
+        unsigned int label = 0;
+        for (unsigned int j = 0; j < num_classes; ++j) {
+            const float cls_score = output[(4 + j) * num_anchors + i];
+            if (cls_score > max_cls_conf) {
+                max_cls_conf = cls_score;
+                label = j;
+            }
+        }
+
+        if (max_cls_conf < score_threshold) continue;
+
+        const float cx = output[0 * num_anchors + i];
+        const float cy = output[1 * num_anchors + i];
+        const float w = output[2 * num_anchors + i];
+        const float h = output[3 * num_anchors + i];
+
+        // Top-left corner in network-input pixels.
+        const float x1_net = cx - w / 2.f;
+        const float y1_net = cy - h / 2.f;
+
+        // Undo the letterbox: remove the padding, then the resize ratio.
+        const float x1 = (x1_net - pad_w) / scale;
+        const float y1 = (y1_net - pad_h) / scale;
+        const float x2 = x1 + w / scale;
+        const float y2 = y1 + h / scale;
+
+        types::Boxf box;
+        box.x1 = std::max(0.f, x1);
+        box.y1 = std::max(0.f, y1);
+        box.x2 = x2;
+        box.y2 = y2;
+        box.score = max_cls_conf;
+        box.label = label;
+        // Never index past the label table when the engine has more classes than names.
+        box.label_text = label < num_names ? class_names[label] : "unknown";
+        box.flag = true;
+        bbox_collection.push_back(box);
+
+        // Stop once exactly max_nms candidates have been collected.
+        if (bbox_collection.size() >= max_nms)
+            break;
+    }
+
+#if LITETRT_DEBUG
+    std::cout << "detected num_anchors: " << num_anchors << "\n";
+    std::cout << "generate_bboxes num: " << bbox_collection.size() << "\n";
+#endif
+}
+
+/**
+ * @brief Resizes @p image with preserved aspect ratio and pads it with @p color towards @p new_shape.
+ *
+ * @param image       Source image; an empty image leaves @p out_image untouched.
+ * @param out_image   Receives the letterboxed image; its final size is always @p new_shape.
+ * @param new_shape   Target size.
+ * @param stride      When @p fixed_shape is true, the total padding is reduced modulo this value;
+ *                    values <= 0 skip that step.
+ * @param color       Border colour.
+ * @param fixed_shape Enables the modulo-stride padding reduction described above.
+ * @param scale_up    When false, the resize ratio is capped at 1 so the image is never enlarged.
+ */
+void TRTYOLOV11::letterbox(const cv::Mat &image, cv::Mat &out_image,
+                           const cv::Size &new_shape,
+                           int stride, const cv::Scalar &color,
+                           bool fixed_shape, bool scale_up) {
+    const cv::Size shape = image.size();
+    // An empty image has nothing to scale, and its zero dimensions would be divided by below.
+    if (shape.width <= 0 || shape.height <= 0) return;
+
+    float r = std::min((float)new_shape.height / (float)shape.height,
+                       (float)new_shape.width / (float)shape.width);
+    if (!scale_up) {
+        r = std::min(r, 1.0f);
+    }
+
+    const int new_unpad_w = int(round(shape.width * r));
+    const int new_unpad_h = int(round(shape.height * r));
+    int dw = new_shape.width - new_unpad_w;
+    int dh = new_shape.height - new_unpad_h;
+
+    // Guard the modulo: a zero stride would be a division by zero.
+    if (fixed_shape && stride > 0) {
+        dw %= stride;
+        dh %= stride;
+    }
+
+    // Split the total padding across both sides; the far side takes the odd pixel so that
+    // left + right == dw and top + bottom == dh (same split as detect()).
+    const int left = dw / 2;
+    const int right = dw - left;
+    const int top = dh / 2;
+    const int bottom = dh - top;
+
+    if (shape.width != new_unpad_w || shape.height != new_unpad_h) {
+        cv::resize(image, out_image, cv::Size(new_unpad_w, new_unpad_h));
+    } else {
+        out_image = image;
+    }
+
+    cv::copyMakeBorder(out_image, out_image, top, bottom, left, right, cv::BORDER_CONSTANT, color);
+
+    // Only reached when the padding was reduced (fixed_shape) or the image was not scaled up.
+    if (out_image.size() != new_shape) {
+        cv::resize(out_image, out_image, new_shape);
+    }
+}
+
+/**
+ * @brief Converts the image in place from BGR to RGB and then to 32-bit float.
+ *
+ * Each value is mapped to value * scale_val + mean_val by cv::Mat::convertTo.
+ *
+ * @param input_image Image to convert; overwritten with the converted data.
+ */
+void TRTYOLOV11::preprocess(cv::Mat &input_image) {
+    // convertTo computes alpha * src + beta per element.
+    const double alpha = scale_val;
+    const double beta = mean_val;
+
+    // Channel order first, then the float conversion with scaling.
+    cv::cvtColor(input_image, input_image, cv::COLOR_BGR2RGB);
+    input_image.convertTo(input_image, CV_32F, alpha, beta);
+}
+
+/**
+ * @brief Runs the full detection pipeline on one image: letterbox, normalise, TensorRT
+ *        inference, box decoding and NMS.
+ *
+ * @param mat             Input image (preprocess() converts it from BGR). An empty image returns
+ *                        immediately without touching @p detected_boxes.
+ * @param detected_boxes  Receives the boxes kept by NMS, in coordinates of @p mat.
+ * @param score_threshold Minimum best-class score for a candidate to be kept.
+ * @param iou_threshold   IoU threshold forwarded to NMS.
+ * @param topk            Forwarded to NMS.
+ * @param nms_type        NMS::HARD, NMS::BLEND or NMS::OFFSET.
+ *
+ * On a CUDA or TensorRT failure an error is written to std::cerr and the function returns
+ * without modifying @p detected_boxes.
+ */
+void TRTYOLOV11::detect(const cv::Mat &mat, std::vector<types::Boxf> &detected_boxes, float score_threshold,
+                       float iou_threshold, unsigned int topk, unsigned int nms_type) {
+
+    if (mat.empty()) return;
+
+    const int target_h = input_node_dims[2];
+    const int target_w = input_node_dims[3];
+    const int img_h = mat.rows;
+    const int img_w = mat.cols;
+
+    // Letterbox: scale by the smaller ratio so the whole image fits, then pad to the target size.
+    const float r = std::min(static_cast<float>(target_h) / img_h, static_cast<float>(target_w) / img_w);
+    const int new_unpad_w = int(round(img_w * r));
+    const int new_unpad_h = int(round(img_h * r));
+
+    cv::Mat mat_rs;
+    if (img_h != new_unpad_h || img_w != new_unpad_w) {
+        cv::resize(mat, mat_rs, cv::Size(new_unpad_w, new_unpad_h));
+    } else {
+        mat_rs = mat.clone();
+    }
+
+    // The far side takes the odd pixel so the padded image is exactly target_w x target_h.
+    const int top = (target_h - new_unpad_h) / 2;
+    const int bottom = target_h - new_unpad_h - top;
+    const int left = (target_w - new_unpad_w) / 2;
+    const int right = target_w - new_unpad_w - left;
+
+    cv::copyMakeBorder(mat_rs, mat_rs, top, bottom, left, right, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));
+
+    preprocess(mat_rs);
+
+    // 1. Make the input (HWC -> CHW)
+    std::vector<float> input;
+    trtcv::utils::transform::create_tensor(mat_rs, input, input_node_dims, trtcv::utils::transform::CHW);
+
+    // 2. Inference. The copy-in, the enqueue and the copy-out are all issued on the same stream
+    //    and therefore execute in order; a single synchronisation at the end is sufficient.
+    if (cudaMemcpyAsync(buffers[0], input.data(),
+                        input_node_dims[0] * input_node_dims[1] * input_node_dims[2] * input_node_dims[3] * sizeof(float),
+                        cudaMemcpyHostToDevice, stream) != cudaSuccess) {
+        std::cerr << "Failed to copy the input tensor to the device." << std::endl;
+        return;
+    }
+
+    bool status = trt_context->enqueueV3(stream); // TensorRT 8.5+ usage
+    if (!status){
+         std::cerr << "Failed to infer by TensorRT." << std::endl;
+         return;
+    }
+
+    // D -> H
+    const auto &pred_dims = output_node_dims[0];
+    const size_t output_size = pred_dims[0] * pred_dims[1] * pred_dims[2];
+    std::vector<float> output(output_size);
+
+    if (cudaMemcpyAsync(output.data(), buffers[1], output_size * sizeof(float),
+                        cudaMemcpyDeviceToHost, stream) != cudaSuccess
+        || cudaStreamSynchronize(stream) != cudaSuccess) {
+        std::cerr << "Failed to copy the output tensor to the host." << std::endl;
+        return;
+    }
+
+    // 3. postprocess: decode the boxes and undo the letterbox.
+    std::vector<types::Boxf> bbox_collection;
+    generate_bboxes(bbox_collection, output.data(), score_threshold, r,
+                    static_cast<float>(left), static_cast<float>(top));
+
+    // Keep every corner inside the source image; the decoder only clamps x1/y1 at 0.
+    const float max_x = static_cast<float>(img_w) - 1.f;
+    const float max_y = static_cast<float>(img_h) - 1.f;
+    const auto clamp_to = [](float value, float upper) { return std::max(0.f, std::min(value, upper)); };
+    for (auto &box : bbox_collection) {
+        box.x1 = clamp_to(box.x1, max_x);
+        box.y1 = clamp_to(box.y1, max_y);
+        box.x2 = clamp_to(box.x2, max_x);
+        box.y2 = clamp_to(box.y2, max_y);
+    }
+
+    nms(bbox_collection, detected_boxes, iou_threshold, topk, nms_type);
+}
diff --git a/lite/trt/cv/trt_yolov11.h b/lite/trt/cv/trt_yolov11.h
@@ -0,0 +1,64 @@
+//
+// Created by wangzijian.
+//
+
+#ifndef LITE_AI_TOOLKIT_TRT_YOLOV11_H
+#define LITE_AI_TOOLKIT_TRT_YOLOV11_H
+
+#include "lite/trt/core/trt_core.h"
+#include "lite/utils.h"
+#include "lite/trt/core/trt_utils.h"
+#include <algorithm>
+
+namespace trtcv {
+    /**
+     * @brief YOLO11 object detector running on a TensorRT engine.
+     *
+     * Engine handling is inherited from BasicTRTHandler; this class adds the image
+     * pre-processing, output decoding and NMS needed by detect().
+     */
+    class LITE_EXPORTS TRTYOLOV11 : public BasicTRTHandler {
+    public:
+        /**
+         * @param _trt_model_path Path of the TensorRT engine, forwarded to BasicTRTHandler.
+         * @param _num_threads    Forwarded to BasicTRTHandler.
+         */
+        explicit TRTYOLOV11(const std::string &_trt_model_path, unsigned int _num_threads = 1)
+                : BasicTRTHandler(_trt_model_path, _num_threads) {}
+
+        ~TRTYOLOV11() override = default;
+
+    private:
+        /// Offset added to every pixel after scaling in preprocess().
+        static constexpr const float mean_val = 0.f;
+        /// Factor applied to every pixel in preprocess(); maps 0..255 to 0..1.
+        static constexpr const float scale_val = 1.0 / 255.f;
+        /// Label text for each class id (80 entries).
+        const char *class_names[80] = {
+                "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+                "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+                "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+                "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+                "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+                "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+                "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
+                "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
+                "scissors", "teddy bear", "hair drier", "toothbrush"
+        };
+        /// Selector for the NMS variant used by nms() and detect().
+        enum NMS {
+            HARD = 0, BLEND = 1, OFFSET = 2
+        };
+        /// Upper bound on the number of candidate boxes collected before NMS.
+        static constexpr const unsigned int max_nms = 30000;
+
+    private:
+        /// Aspect-preserving resize plus constant-colour padding towards new_shape.
+        void letterbox(const cv::Mat &image, cv::Mat &out_image,
+                       const cv::Size &new_shape,
+                       int stride, const cv::Scalar &color,
+                       bool fixed_shape, bool scale_up);
+
+        /// In-place BGR -> RGB conversion followed by conversion to scaled 32-bit float.
+        void preprocess(cv::Mat &input_image);
+
+        /// Decodes the raw network output into boxes, undoing the letterbox scale and padding.
+        void generate_bboxes(std::vector<types::Boxf> &bbox_collection,
+                             float *output,
+                             float score_threshold,
+                             float scale, float pad_w, float pad_h);
+
+        /// Runs the NMS variant selected by nms_type on input and stores the survivors in output.
+        void nms(std::vector<types::Boxf> &input, std::vector<types::Boxf> &output,
+                 float iou_threshold, unsigned int topk, unsigned int nms_type);
+
+    public:
+        /**
+         * @brief Detects objects in @p mat and stores the boxes kept by NMS in @p detected_boxes.
+         *
+         * @param score_threshold Minimum class score for a candidate box.
+         * @param iou_threshold   IoU threshold used by NMS.
+         * @param topk            Forwarded to NMS.
+         * @param nms_type        One of the NMS enumerators.
+         */
+        void detect(const cv::Mat &mat, std::vector<types::Boxf> &detected_boxes,
+                    float score_threshold = 0.25f, float iou_threshold = 0.45f,
+                    unsigned int topk = 100, unsigned int nms_type = NMS::OFFSET);
+    };
+
+} // namespace trtcv
+
+#endif //LITE_AI_TOOLKIT_TRT_YOLOV11_H

Original file line number	Diff line number	Diff line change
`@@ -16,6 +16,7 @@ namespace trtcv{`
`16`	`16`	`class LITE_EXPORTS TRTYoloV8; // [4] * reference: https://github.com/ultralytics/ultralytics/tree/main`
`17`	`17`	`class LITE_EXPORTS TRTYoloV6; // [5] * reference: https://github.com/meituan/YOLOv6`
`18`	`18`	`class LITE_EXPORTS TRTYOLO5Face; // [6] * reference: https://github.com/deepcam-cn/yolov5-face`
	`19`	`+ class LITE_EXPORTS TRTYOLOV11;`
`19`	`20`	`}`
`20`	`21`
`21`	`22`	`namespace trtcv{`