huggingface
diff --git a/‎Dockerfile-cuda-all‎
Lines changed: 20 additions & 0 deletions b/‎Dockerfile-cuda-all‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎README_MODIFICATIONS.md‎
Lines changed: 82 additions & 0 deletions b/‎README_MODIFICATIONS.md‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎backends/candle/src/compute_cap.rs‎
Lines changed: 17 additions & 4 deletions b/‎backends/candle/src/compute_cap.rs‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎backends/candle/src/layers/linear.rs‎
Lines changed: 6 additions & 3 deletions b/‎backends/candle/src/layers/linear.rs‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎backends/candle/src/lib.rs‎
Lines changed: 34 additions & 4 deletions b/‎backends/candle/src/lib.rs‎
Lines changed: 34 additions & 4 deletions
@@ -63,7 +63,13 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
     CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=87 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=89 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 COPY backends backends
 COPY core core
 COPY router router
@@ -82,6 +88,18 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80
 
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=87 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-87
+
+RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
+    --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
+    CUDA_COMPUTE_CAP=89 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
+
+RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-89
+
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
     CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
@@ -104,6 +122,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
 
 COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local/bin/text-embeddings-router-75
 COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
+COPY --from=builder /usr/src/target/release/text-embeddings-router-87 /usr/local/bin/text-embeddings-router-87
+COPY --from=builder /usr/src/target/release/text-embeddings-router-89 /usr/local/bin/text-embeddings-router-89
 COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
 
 COPY --chmod=775 cuda-all-entrypoint.sh entrypoint.sh
 
@@ -0,0 +1,82 @@
+# Text Embeddings Inference - SM87/SM89 适配版本
+
+## 修改内容
+
+本版本基于 text-embeddings-inference 项目，针对 NVIDIA Jetson Orin (SM87) 和 L4 GPU (SM89) 进行了适配，并集成了以下社区 PR：
+
+### 1. SM87/SM89 CUDA 支持
+- 支持 NVIDIA Jetson AGX Orin (compute capability 8.7)
+- 支持 NVIDIA L4 GPU (compute capability 8.9)
+- 修改文件：
+  - `Dockerfile-cuda-all`
+  - `cuda-all-entrypoint.sh`
+  - `backends/candle/src/compute_cap.rs`
+
+### 2. PR #730: Qwen3 Reranker 支持
+- 添加 Qwen3 分类头用于重排序任务
+- 实现模板格式化系统支持聊天格式
+- 修改文件：
+  - `backends/candle/src/models/qwen3.rs`
+  - `core/src/templates.rs` (新增)
+  - `core/src/lib.rs`
+
+### 3. PR #787: 批处理通知性能优化
+- 使用 AtomicUsize 计数器优化批处理场景的线程通知
+- 仅在批处理最后一个请求时触发通知，减少不必要的 notify_one() 调用
+- 修改文件：
+  - `core/src/infer.rs`
+  - `router/src/http/server.rs`
+  - `router/src/grpc/server.rs`
+
+### 4. PR #753: GeLU 激活函数一致性修复
+- 将 `Gelu` 的实现从 tanh 近似版本 (`gelu`) 改为基于 erf 的精确版本 (`gelu_erf`)
+- 添加 `NewGelu` 变体（对应配置值 `gelu_new` / `gelu_pytorch_tanh`），继续使用 tanh 近似实现 (`gelu`) 以保持向后兼容
+- 修改文件：
+  - `backends/candle/src/layers/linear.rs`
+
+### 5. PR #790: StaticEmbedding 模型支持
+- 支持 sentence-transformers 的 0_StaticEmbedding/ 目录结构
+- 添加模型权重和 tokenizer 的 fallback 加载逻辑
+- 为 StaticEmbedding 模型默认使用 Mean pooling
+- 修改文件：
+  - `backends/candle/src/models/static_embedding.rs` (新增)
+  - `backends/candle/src/lib.rs`
+  - `backends/src/lib.rs`
+  - `core/src/download.rs`
+  - `router/src/lib.rs`
+
+### 6. PR #746: DebertaV2 序列分类支持
+- 添加完整的 DebertaV2 模型实现
+- 支持序列分类任务（如 Llama Prompt Guard）
+- 支持 CPU 和 CUDA 设备
+- 修改文件：
+  - `backends/candle/src/models/debertav2.rs` (新增)
+  - `backends/candle/src/lib.rs`
+  - `backends/candle/src/models/mod.rs`
+
+## 编译验证
+
+所有修改已通过编译检查：
+```bash
+cargo check --all-targets
+Finished `dev` profile [unoptimized + debuginfo] target(s) in 23.76s
+```
+
+## 部署说明
+
+### 构建 Docker 镜像（支持 SM87/SM89）
+```bash
+docker build -f Dockerfile-cuda-all -t tei-sm87:latest .
+```
+
+### 运行示例
+```bash
+docker run --gpus all -p 8080:80 \
+  -v $PWD/data:/data \
+  tei-sm87:latest \
+  --model-id BAAI/bge-large-zh-v1.5 \
+  --pooling mean
+```
+
+## 修改日期
+2026年1月5日
@@ -26,8 +26,9 @@ pub fn get_runtime_compute_cap() -> Result<usize, anyhow::Error> {
 fn compute_cap_matching(runtime_compute_cap: usize, compile_compute_cap: usize) -> bool {
     match (runtime_compute_cap, compile_compute_cap) {
         (75, 75) => true,
-        (80..=89, 80) => true,
-        (86..=89, 80..=86) => true,
+        (80..=86, 80) => true,
+        (86..=86, 80..=86) => true,
+        (87, 87) => true,
         (89, 89) => true,
         (90, 90) => true,
         (_, _) => false,
@@ -52,33 +53,45 @@ mod tests {
         assert!(compute_cap_matching(75, 75));
         assert!(compute_cap_matching(80, 80));
         assert!(compute_cap_matching(86, 86));
+        assert!(compute_cap_matching(87, 87));
         assert!(compute_cap_matching(89, 89));
         assert!(compute_cap_matching(90, 90));
 
         assert!(compute_cap_matching(86, 80));
-        assert!(compute_cap_matching(89, 80));
-        assert!(compute_cap_matching(89, 86));
 
         assert!(!compute_cap_matching(75, 80));
         assert!(!compute_cap_matching(75, 86));
+        assert!(!compute_cap_matching(75, 87));
         assert!(!compute_cap_matching(75, 89));
         assert!(!compute_cap_matching(75, 90));
 
         assert!(!compute_cap_matching(80, 75));
         assert!(!compute_cap_matching(80, 86));
+        assert!(!compute_cap_matching(80, 87));
         assert!(!compute_cap_matching(80, 89));
         assert!(!compute_cap_matching(80, 90));
 
         assert!(!compute_cap_matching(86, 75));
+        assert!(!compute_cap_matching(86, 87));
         assert!(!compute_cap_matching(86, 89));
         assert!(!compute_cap_matching(86, 90));
 
+        assert!(!compute_cap_matching(87, 75));
+        assert!(!compute_cap_matching(87, 80));
+        assert!(!compute_cap_matching(87, 86));
+        assert!(!compute_cap_matching(87, 89));
+        assert!(!compute_cap_matching(87, 90));
+
         assert!(!compute_cap_matching(89, 75));
+        assert!(!compute_cap_matching(89, 80));
+        assert!(!compute_cap_matching(89, 86));
+        assert!(!compute_cap_matching(89, 87));
         assert!(!compute_cap_matching(89, 90));
 
         assert!(!compute_cap_matching(90, 75));
         assert!(!compute_cap_matching(90, 80));
         assert!(!compute_cap_matching(90, 86));
+        assert!(!compute_cap_matching(90, 87));
         assert!(!compute_cap_matching(90, 89));
     }
 }
@@ -5,8 +5,9 @@ use serde::Deserialize;
 #[derive(Debug, Deserialize, PartialEq, Clone)]
 #[serde(rename_all = "lowercase")]
 pub enum HiddenAct {
-    #[serde(alias = "gelu_pytorch_tanh")]
     Gelu,
+    #[serde(alias = "gelu_new", alias = "gelu_pytorch_tanh")]
+    NewGelu,
     Relu,
     Silu,
     Swiglu,
@@ -15,7 +16,8 @@ pub enum HiddenAct {
 impl HiddenAct {
     pub fn forward(&self, x: &Tensor) -> Result<Tensor> {
         match self {
-            Self::Gelu => x.gelu(),
+            Self::Gelu => x.gelu_erf(),
+            Self::NewGelu => x.gelu(),
             Self::Relu => x.relu(),
             Self::Silu => x.silu(),
             Self::Swiglu => candle_nn::ops::swiglu(x),
@@ -84,7 +86,8 @@ impl Linear {
 
             if let Some(act) = &self.act {
                 match act {
-                    HiddenAct::Gelu => x.gelu(),
+                    HiddenAct::Gelu => x.gelu_erf(),
+                    HiddenAct::NewGelu => x.gelu(),
                     HiddenAct::Relu => x.relu(),
                     HiddenAct::Silu => x.silu(),
                     HiddenAct::Swiglu => candle_nn::ops::swiglu(&x),
 
@@ -22,10 +22,11 @@ use crate::compute_cap::{
     compatible_compute_cap, get_compile_compute_cap, get_runtime_compute_cap,
 };
 use crate::models::{
-    BertConfig, BertModel, Dense, DenseConfig, DenseLayer, DistilBertConfig, DistilBertModel,
-    GTEConfig, GTEModel, Gemma3Config, Gemma3Model, JinaBertModel, JinaCodeBertModel, MPNetConfig,
-    MPNetModel, MistralConfig, Model, ModernBertConfig, ModernBertModel, NomicBertModel,
-    NomicConfig, Qwen2Config, Qwen3Config, Qwen3Model,
+    BertConfig, BertModel, DebertaV2Config, DebertaV2Model, Dense, DenseConfig, DenseLayer,
+    DistilBertConfig, DistilBertModel, GTEConfig, GTEModel, Gemma3Config, Gemma3Model,
+    JinaBertModel, JinaCodeBertModel, MPNetConfig, MPNetModel, MistralConfig, Model,
+    ModernBertConfig, ModernBertModel, NomicBertModel, NomicConfig, Qwen2Config, Qwen3Config,
+    Qwen3Model, StaticEmbeddingConfig, StaticEmbeddingModel,
 };
 #[cfg(feature = "cuda")]
 use crate::models::{
@@ -93,6 +94,8 @@ impl<'de> Deserialize<'de> for BertConfigWrapper {
 enum Config {
     Bert(BertConfigWrapper),
     Camembert(BertConfig),
+    #[serde(rename(deserialize = "deberta-v2"))]
+    DebertaV2(DebertaV2Config),
     #[serde(rename(deserialize = "distilbert"))]
     DistilBert(DistilBertConfig),
     #[serde(rename(deserialize = "gemma3_text"))]
@@ -112,6 +115,8 @@ enum Config {
     #[allow(dead_code)]
     Qwen3(Qwen3Config),
     Roberta(BertConfig),
+    #[serde(rename(deserialize = "static-embedding"))]
+    StaticEmbedding(StaticEmbeddingConfig),
     XlmRoberta(BertConfig),
 }
 
@@ -131,12 +136,15 @@ impl CandleBackend {
         // Default files
         let default_safetensors = model_path.join("model.safetensors");
         let default_pytorch = model_path.join("pytorch_model.bin");
+        let static_embedding_safetensors = model_path.join("0_StaticEmbedding/model.safetensors");
 
         // Single Files
         let model_files = if default_safetensors.exists() {
             vec![default_safetensors]
         } else if default_pytorch.exists() {
             vec![default_pytorch]
+        } else if static_embedding_safetensors.exists() {
+            vec![static_embedding_safetensors]
         }
         // Sharded weights
         else {
@@ -259,6 +267,10 @@ impl CandleBackend {
                     BertModel::load_roberta(vb, &config, model_type).s()?,
                 ))
             }
+            (Config::DebertaV2(config), Device::Cpu | Device::Metal(_)) => {
+                tracing::info!("Starting DebertaV2 model on {:?}", device);
+                Ok(Box::new(DebertaV2Model::load(vb, &config, model_type).s()?))
+            }
             (Config::DistilBert(config), Device::Cpu | Device::Metal(_)) => {
                 tracing::info!("Starting DistilBert model on {:?}", device);
                 Ok(Box::new(
@@ -305,6 +317,12 @@ impl CandleBackend {
                 tracing::info!("Starting Qwen3 model on {:?}", device);
                 Ok(Box::new(Qwen3Model::load(vb, &config, model_type).s()?))
             }
+            (Config::StaticEmbedding(config), Device::Cpu | Device::Metal(_)) => {
+                tracing::info!("Starting StaticEmbedding model on {:?}", device);
+                Ok(Box::new(
+                    StaticEmbeddingModel::load(vb, &config, model_type).s()?,
+                ))
+            }
             #[cfg(feature = "cuda")]
             (Config::Bert(config), Device::Cuda(_)) => {
                 if cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
@@ -373,6 +391,11 @@ impl CandleBackend {
                 }
             }
             #[cfg(feature = "cuda")]
+            (Config::DebertaV2(config), Device::Cuda(_)) => {
+                tracing::info!("Starting DebertaV2 model on {:?}", device);
+                Ok(Box::new(DebertaV2Model::load(vb, &config, model_type).s()?))
+            }
+            #[cfg(feature = "cuda")]
             (Config::DistilBert(config), Device::Cuda(_)) => {
                 if cfg!(feature = "flash-attn")
                     && dtype == DType::F16
@@ -509,6 +532,13 @@ impl CandleBackend {
                     ))
                 }
             }
+            #[cfg(feature = "cuda")]
+            (Config::StaticEmbedding(config), Device::Cuda(_)) => {
+                tracing::info!("Starting StaticEmbedding model on {:?}", device);
+                Ok(Box::new(
+                    StaticEmbeddingModel::load(vb, &config, model_type).s()?,
+                ))
+            }
         };
 
         let mut dense_layers = Vec::new();