Skip to content

Commit 37112ee

Browse files
authored
Use floret instead of fastText (#157)
fastText is EOL, so switch to floret, which is forked from fastText and is under active development. Signed-off-by: Jagadeesh Pagadala <jpagadal@qti.qualcomm.com>
1 parent 279356d commit 37112ee

7 files changed

Lines changed: 72 additions & 54 deletions

File tree

contextual-classifier/Artifacts/fasttext_model_supervised.bin renamed to contextual-classifier/Artifacts/floret_model_supervised.bin

File renamed without changes.

contextual-classifier/CMakeLists.txt

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -23,60 +23,61 @@ target_include_directories(ContextualClassifier
2323
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/Include
2424
)
2525

26-
# fastText detection and USE_FASTTEXT macro
27-
option(ENABLE_FASTTEXT "Build ML inference with fastText support" ON)
26+
# floret detection and USE_FLORET macro using pkg-config
27+
option(ENABLE_FLORET "Build ML inference with floret support" ON)
2828

29-
set(FASTTEXT_FOUND FALSE)
30-
set(FASTTEXT_TARGET "")
29+
set(FLORET_FOUND FALSE)
30+
set(FLORET_LIBRARIES "")
31+
set(FLORET_INCLUDE_DIRS "")
3132

32-
if(ENABLE_FASTTEXT)
33-
# First try a CMake package
34-
find_package(fasttext QUIET)
35-
36-
if(TARGET fasttext::fasttext)
37-
set(FASTTEXT_FOUND TRUE)
38-
set(FASTTEXT_TARGET fasttext::fasttext)
39-
elseif(TARGET fasttext)
40-
set(FASTTEXT_FOUND TRUE)
41-
set(FASTTEXT_TARGET fasttext)
42-
else()
43-
# Fallback: probe the linker directly for -lfasttext
44-
include(CheckLibraryExists)
45-
check_library_exists(fasttext fasttext_version "" FASTTEXT_LINKABLE)
46-
if(FASTTEXT_LINKABLE)
47-
set(FASTTEXT_FOUND TRUE)
48-
set(FASTTEXT_TARGET fasttext)
49-
endif()
33+
if(ENABLE_FLORET)
34+
# Use pkg-config to find floret
35+
pkg_check_modules(PC_FLORET QUIET floret)
36+
if(PC_FLORET_FOUND)
37+
message(STATUS "floret not found via pkg-config")
38+
set(FLORET_FOUND TRUE)
39+
set(FLORET_LIBRARIES ${PC_FLORET_LIBRARIES})
40+
set(FLORET_INCLUDE_DIRS ${PC_FLORET_INCLUDE_DIRS})
5041
endif()
5142
endif()
5243

53-
if(FASTTEXT_FOUND)
54-
message(STATUS "fastText found, building MLInference with USE_FASTTEXT=1")
55-
add_definitions(-DUSE_FASTTEXT=1)
44+
if(FLORET_FOUND)
45+
message(STATUS "floret found via pkg-config, building MLInference with USE_FLORET=1")
46+
add_definitions(-DUSE_FLORET=1)
5647

57-
# Define the ML inference library that actually uses fastText
58-
add_library(ml_inference_lib STATIC
48+
# Define the ML inference library that actually uses FLORET
49+
add_library(ml_inference_lib SHARED
5950
FeatureExtractor.cpp
6051
FeaturePruner.cpp
6152
MLInference.cpp
6253
)
6354

55+
set_target_properties(ml_inference_lib PROPERTIES
56+
VERSION 1.0.0
57+
SOVERSION 1
58+
)
59+
60+
target_link_libraries(ContextualClassifier PRIVATE ml_inference_lib)
61+
6462
target_link_libraries(ml_inference_lib
65-
PRIVATE ${FASTTEXT_TARGET}
63+
PRIVATE
64+
${FLORET_LIBRARIES}
6665
)
6766

6867
target_include_directories(ml_inference_lib
69-
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/Include
68+
PUBLIC
69+
${CMAKE_CURRENT_SOURCE_DIR}/Include
70+
${FLORET_INCLUDE_DIRS}
7071
)
7172
else()
72-
message(STATUS "fastText not found or ENABLE_FASTTEXT=OFF — falling back to base Inference only")
73+
message(STATUS "floret not found via pkg-config or ENABLE_FLORET=OFF — falling back to base Inference only")
7374
endif()
7475

7576
# Installation rules
76-
install(TARGETS ContextualClassifier DESTINATION ${CMAKE_INSTALL_LIBDIR})
77-
if(FASTTEXT_FOUND)
78-
install(TARGETS ml_inference_lib DESTINATION ${CMAKE_INSTALL_LIBDIR})
79-
install(FILES ${CMAKE_SOURCE_DIR}/contextual-classifier/Artifacts/fasttext_model_supervised.bin DESTINATION ${CMAKE_INSTALL_SYSCONFDIR}/urm/classifier)
77+
install(TARGETS ContextualClassifier DESTINATION lib)
78+
if(FLORET_FOUND)
79+
install(TARGETS ml_inference_lib DESTINATION lib)
80+
install(FILES ${CMAKE_SOURCE_DIR}/contextual-classifier/Artifacts/floret_model_supervised.bin DESTINATION ${CMAKE_INSTALL_SYSCONFDIR}/urm/classifier)
8081
endif()
8182

8283
install(

contextual-classifier/ContextualClassifier.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,16 @@
2727
#define CLASSIFIER_TAG "CONTEXTUAL_CLASSIFIER"
2828
#define CLASSIFIER_CONFIGS_DIR "/etc/urm/classifier/"
2929

30-
static const std::string FT_MODEL_PATH =
31-
CLASSIFIER_CONFIGS_DIR "fasttext_model_supervised.bin";
32-
static const std::string IGNORE_PROC_PATH =
30+
const std::string FT_MODEL_PATH =
31+
CLASSIFIER_CONFIGS_DIR "floret_model_supervised.bin";
32+
const std::string IGNORE_PROC_PATH =
3333
CLASSIFIER_CONFIGS_DIR "classifier-blocklist.txt";
3434
static const std::string IGNORE_TOKENS_PATH =
3535
CLASSIFIER_CONFIGS_DIR "ignore-tokens.txt";
3636
static const std::string ALLOW_LIST_PATH =
3737
CLASSIFIER_CONFIGS_DIR "allow-list.txt";
3838

39-
#ifdef USE_FASTTEXT
39+
#ifdef USE_FLORET
4040
#include "MLInference.h"
4141
#include "FeatureExtractor.h"
4242
#include "FeaturePruner.h"

contextual-classifier/FeatureExtractor.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
22
// SPDX-License-Identifier: BSD-3-Clause-Clear
33

4+
#include "AuxRoutines.h"
45
#include "FeatureExtractor.h"
56
#include "FeaturePruner.h"
67
#include "Logger.h"
@@ -22,6 +23,8 @@
2223
#define SCANNER_TAG "FeatureExtractor"
2324
#define LOG_LINES 20
2425

26+
std::unordered_map<std::string, std::unordered_set<std::string>> FeatureExtractor::mTokenIgnoreMap;
27+
2528
static std::string format_string(const char *fmt, ...) {
2629
char buffer[1024];
2730
va_list args;

contextual-classifier/Include/ContextualClassifier.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ class ContextualClassifier {
113113
public:
114114
ContextualClassifier();
115115
~ContextualClassifier();
116+
Inference *getInference() {
117+
return mInference;
118+
}
116119

117120
ErrCode Init();
118121
ErrCode Terminate();

contextual-classifier/Include/MLInference.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,19 @@
44
#ifndef ML_INFERENCE_H
55
#define ML_INFERENCE_H
66

7+
#include "AuxRoutines.h"
78
#include "Inference.h"
89
#include <vector>
910

10-
#include <fasttext/fasttext.h>
11+
#include <floret/fasttext.h>
1112
#include <mutex>
1213
#include <string>
1314

1415
class MLInference : public Inference {
1516
public:
1617
MLInference(const std::string &ft_model_path);
1718
~MLInference();
18-
int Classify(int process_pid) override;
19+
CC_TYPE Classify(int process_pid) override;
1920
private:
2021
// Derived implementation using fastText.
2122
uint32_t predict(int pid,

contextual-classifier/MLInference.cpp

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "MLInference.h"
55
#include "ContextualClassifier.h"
6+
#include "FeatureExtractor.h"
67
#include <algorithm>
78
#include <cmath>
89
#include <fstream>
@@ -16,6 +17,15 @@
1617

1718
#define CLASSIFIER_TAG "MLInference"
1819

20+
static std::string format_string(const char *fmt, ...) {
21+
char buffer[1024];
22+
va_list args;
23+
va_start(args, fmt);
24+
vsnprintf(buffer, sizeof(buffer), fmt, args);
25+
va_end(args);
26+
return std::string(buffer);
27+
}
28+
1929
MLInference::MLInference(const std::string &ft_model_path)
2030
: Inference(ft_model_path) {
2131
text_cols_ = {"attr", "cgroup", "cmdline", "comm", "maps",
@@ -46,8 +56,8 @@ std::string MLInference::normalize_text(const std::string &text) {
4656
}
4757

4858
CC_TYPE MLInference::Classify(int process_pid) {
49-
LOGD(CLASSIFIER_TAG,
50-
format_string("Starting classification for PID:%d", process_pid));
59+
//LOGD(CLASSIFIER_TAG,
60+
// format_string("Starting classification for PID:%d", process_pid));
5161

5262
const std::string proc_path = "/proc/" + std::to_string(process_pid);
5363
CC_TYPE contextType = CC_APP;
@@ -56,24 +66,24 @@ CC_TYPE MLInference::Classify(int process_pid) {
5666

5767
auto start_collect = std::chrono::high_resolution_clock::now();
5868
int collect_rc = FeatureExtractor::CollectAndStoreData(
59-
process_pid, raw_data, mDebugMode);
69+
process_pid, raw_data, false);
6070
auto end_collect = std::chrono::high_resolution_clock::now();
6171
std::chrono::duration<double, std::milli> elapsed_collect =
6272
end_collect - start_collect;
63-
LOGD(CLASSIFIER_TAG,
64-
format_string("Data collection for PID:%d took %f ms (rc=%d)",
65-
process_pid, elapsed_collect.count(), collect_rc));
73+
//LOGD(CLASSIFIER_TAG,
74+
// format_string("Data collection for PID:%d took %f ms (rc=%d)",
75+
// process_pid, elapsed_collect.count(), collect_rc));
6676

6777
if (collect_rc != 0) {
6878
// Process exited or collection failed; skip further work.
69-
return 0;
79+
return contextType;
7080
}
7181

72-
LOGD(CLASSIFIER_TAG,
73-
format_string("Text features collected for PID:%d", process_pid));
82+
//LOGD(CLASSIFIER_TAG,
83+
// format_string("Text features collected for PID:%d", process_pid));
7484

7585
if (!AuxRoutines::fileExists(proc_path)) {
76-
return 0;
86+
return contextType;
7787
}
7888

7989
bool has_sufficient_features = false;
@@ -93,8 +103,8 @@ CC_TYPE MLInference::Classify(int process_pid) {
93103
format_string("Invoking ML inference for PID:%d", process_pid));
94104

95105
auto start_inference = std::chrono::high_resolution_clock::now();
96-
if (mInference) {
97-
uint32_t rc = mInference->predict(process_pid, raw_data, predicted_label);
106+
//if (Inference) {
107+
uint32_t rc = predict(process_pid, raw_data, predicted_label);
98108
auto end_inference = std::chrono::high_resolution_clock::now();
99109
std::chrono::duration<double, std::milli> elapsed_inference =
100110
end_inference - start_inference;
@@ -105,11 +115,11 @@ CC_TYPE MLInference::Classify(int process_pid) {
105115
// Inference failed, keep contextType as UNKNOWN.
106116
predicted_label.clear();
107117
}
108-
} else {
118+
/*} else {
109119
LOGW(CLASSIFIER_TAG,
110120
format_string("No Inference object available for PID:%d",
111121
process_pid));
112-
}
122+
}*/
113123

114124
// Map stripped label -> CC_APP enum.
115125
// MLInference::predict() returns after stripping "__label__".

0 commit comments

Comments
 (0)