-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsam3.h
More file actions
70 lines (64 loc) · 2.84 KB
/
sam3.h
File metadata and controls
70 lines (64 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#ifndef SAM3_CPP_H_
#define SAM3_CPP_H_
#include <onnxruntime_cxx_api.h>
#include <tokenizers_cpp.h>
#include <opencv2/core.hpp>
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <list>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <tuple>
#include <vector>
#include "util.h"
using tokenizers::Tokenizer;
/// Wrapper that owns three ONNX Runtime sessions (vision encoder, text encoder,
/// decoder) and a tokenizer, together with the shape/value buffers and cached
/// tensor names used when running them.
///
/// NOTE(review): only declarations are visible in this header. Remarks marked
/// "presumably" are inferred from names and signatures alone — confirm them
/// against the implementation file before relying on them.
class Sam3 {
    // Everything up to `public:` is private (default class access).

    // --- ONNX Runtime infrastructure -------------------------------------
    // Declared BEFORE the sessions on purpose: non-static data members are
    // destroyed in reverse declaration order, and Ort::Env holds the logging
    // state used by the other ORT objects, so it has to be torn down last.
    Ort::Env env;
    Ort::SessionOptions sessionOptions;
    Ort::RunOptions runOptionsEncoder;
    Ort::MemoryInfo memoryInfo{Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault)};

    // --- Models ----------------------------------------------------------
    std::unique_ptr<Ort::Session> visionEncoder, textEncoder, decoder;
    std::unique_ptr<Tokenizer> tokenizer;

    // --- Vision encoder buffers ------------------------------------------
    // Float storage for input tensor values (presumably the preprocessed
    // image fed to the vision encoder — confirm).
    std::vector<float> inputTensorValuesFloat;
    std::vector<int64_t> inputShapeVision;
    // Four output slots; the "Batch" variants are presumably the per-prompt
    // replicated copies consumed by the decoder — confirm.
    std::vector<int64_t> outputShapeVision[4];
    std::vector<int64_t> outputShapeVisionBatch[4];
    std::vector<float> outputVision[4];
    std::vector<float> outputVisionBatch[4];

    // --- Text encoder buffers --------------------------------------------
    std::vector<int64_t> inputShapeText[2];
    std::vector<int64_t> outputShapeText[2];
    // The two text outputs are stored with different element types.
    std::vector<float> outputText0;
    std::vector<uint8_t> outputText1;

    // --- Decoder buffers -------------------------------------------------
    std::vector<int64_t> outputShapeDecoder[4];
    std::vector<float> outputDecoder[4];

    // --- Tensor names ----------------------------------------------------
    // Owning copies of the input/output names of each session.
    std::vector<std::string> cachedInputNamesVision, cachedOutputNamesVision;
    std::vector<std::string> cachedInputNamesText, cachedOutputNamesText;
    std::vector<std::string> cachedInputNamesDecoder, cachedOutputNamesDecoder;
    // char* pointer vectors — rebuilt once from the above, reused every Run().
    // They alias the strings above, so they must be rebuilt whenever the
    // cached name vectors are modified.
    std::vector<const char*> ptrInputNamesVision, ptrOutputNamesVision;
    std::vector<const char*> ptrInputNamesText, ptrOutputNamesText;
    std::vector<const char*> ptrInputNamesDecoder, ptrOutputNamesDecoder;

    // --- State flags -----------------------------------------------------
    // NOTE(review): plain bools. If these are read and written from different
    // threads (the start/end/terminate API suggests it), they need
    // synchronization, e.g. std::atomic<bool> — confirm the threading model.
    bool loadingModel = false;
    bool preprocessing = false;
    bool terminating = false;

public:
    Sam3();
    ~Sam3();

    // Model lifecycle. NOTE(review): the meaning of the bool results is not
    // visible here; presumably true means success — confirm.
    bool clearLoadModel();
    void clearVisionBatch();
    void clearDecoder();
    bool isDecoderEmpty();
    void terminatePreprocessing();

    /// Takes the paths of the three models and of the tokenizer, a thread
    /// count and a device name.
    /// NOTE(review): `device` is passed by value, so the string is copied on
    /// every call; switching to `const std::string&` requires changing the
    /// definition in the implementation file at the same time.
    bool loadModel(const std::string& visionPath, const std::string& textPath, const std::string& decoderPath, const std::string& tokenizerPath, int threadsNumber, const std::string device);
    void loadingStart();
    void loadingEnd();

    // Image side.
    cv::Size getInputSize();
    bool preprocessImage(const cv::Mat& image);
    void preprocessingStart();
    void preprocessingEnd();

    // Prompt side. alignTextsAndBoxes receives all three lists by pointer, so
    // it can modify them in place.
    bool encodeText(const std::vector<std::string> &text_list);
    void alignTextsAndBoxes(std::vector<std::string> *text_list, std::vector<std::vector<cv::Rect2f>> *rects_list, std::vector<std::vector<int>> *labels_list);
    void setOutputVisionToInputTensors(int batchSize, std::vector<Ort::Value> *inputTensors);

    // Decoding. Both functions return a (vector of cv::Mat, vector of int)
    // tuple; presumably masks and their associated indices/labels — confirm.
    std::tuple<std::vector<cv::Mat>, std::vector<int>> decode(const std::vector<std::vector<cv::Rect2f>> &rects_list, const std::vector<std::vector<int>> &labels_list, float threshold, const cv::Size &imageSize, bool skipDecode);
    std::tuple<std::vector<cv::Mat>, std::vector<int>> changeThreshold(float threshold, const cv::Size &imageSize);
};
#endif