From d405e2b344de2558f9f94ac1acc9acdce4339ee5 Mon Sep 17 00:00:00 2001 From: Jim McKeeth Date: Thu, 18 Jun 2026 23:50:33 -0600 Subject: [PATCH] feature: updated alternative embedding options after testing --- EMBEDDINGS.md | 25 +++++++++-------- scripts/MTEB-RANKINGS.md | 55 +++++++++++++++++++++++++++++++++++++ scripts/find_best_models.py | 8 +++++- 3 files changed, 76 insertions(+), 12 deletions(-) create mode 100644 scripts/MTEB-RANKINGS.md diff --git a/EMBEDDINGS.md b/EMBEDDINGS.md index 42cd1dd..bccdc72 100644 --- a/EMBEDDINGS.md +++ b/EMBEDDINGS.md @@ -68,14 +68,15 @@ This option runs embedding models directly on your machine using the library. ### Recommended Models -These are based on MTEB [datasets](https://huggingface.co/datasets/mteb/results) as of 13-Jun-2026. +These are based on MTEB [datasets](https://huggingface.co/datasets/mteb/results) as of 15-Jun-2026. All listed models have been verified to work with the `sentence-transformers` provider in `cocoindex-code`. | Tier | Model | Params | Code Score | Best For | | :--- | :--- | :--- | :--- | :--- | -| **Micro** | [`Snowflake/arctic-embed-xs`](https://huggingface.co/Snowflake/snowflake-arctic-embed-xs) | 22M | 0.67 | Old CPUs, minimal RAM usage. | -| **Small** | [`ibm-granite/granite-embedding-97m-multilingual-r2`](https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2) | 97M | 0.80 | Modern laptops, multilingual code. | -| **Medium** | [`jinaai/jina-embeddings-v5-text-nano`](https://huggingface.co/jinaai/jina-embeddings-v5-text-nano) | 239M | **0.90** | **Performance sweet spot.** BERT-based (Fast). | -| **High** | [`geevec-ai/geevec-embeddings-1.0-lite`](https://huggingface.co/geevec-ai/geevec-embeddings-1.0-lite) | 366M | **0.92** | Maximum local accuracy (needs GPU for speed). 
| +| **Default** | [`Snowflake/snowflake-arctic-embed-xs`](https://huggingface.co/Snowflake/snowflake-arctic-embed-xs) | 22M | 0.67 | Default model. Old CPUs, minimal RAM usage. | +| **Micro** | [`lightonai/LateOn-Code-edge`](https://huggingface.co/lightonai/LateOn-Code-edge) | 17M | 0.82 | **Efficiency King.** Incredible code performance for its size. | +| **Small** | [`lightonai/LateOn-Code`](https://huggingface.co/lightonai/LateOn-Code) | 149M | 0.85 | Great balance of speed and accuracy on modern laptops. | +| **Medium** | [`microsoft/harrier-oss-v1-270m`](https://huggingface.co/microsoft/harrier-oss-v1-270m) | 270M | **0.90** | **Performance sweet spot.** High accuracy, runs well on CPUs. | +| **Multi** | [`ibm-granite/granite-embedding-97m-multilingual-r2`](https://huggingface.co/ibm-granite/granite-embedding-97m-multilingual-r2) | 97M | 0.80 | Multilingual codebases (e.g. Code + Docs in different languages). | #### Other Model Options @@ -190,8 +191,8 @@ envs: ## Choosing Based on Your Content -- **Heavy Source Code**: Use **Jina v5 Nano** (Local) or **Voyage 4 Large** (Cloud). Both score >0.90 on code search benchmarks. -- **Large Documentation / Files**: Models with large context windows (8k+ tokens) like **Jina v5** (32k) or **OpenAI v3 Large** (8k). +- **Heavy Source Code**: Use **LateOn-Code-edge** (Micro), **LateOn-Code** (Small), or **Harrier 270m** (Medium). These score between 0.82 and 0.90 on code search benchmarks. +- **Large Documentation / Files**: Models with large context windows like **Voyage 4 Large** (Cloud) or **OpenAI v3 Large** (8k). - **Multilingual Projects**: **Granite 97m** (Small Local) or **Cohere Multilingual v3** (Cloud). 
### Fine-Tuning with `indexing_params` and `query_params` @@ -210,16 +211,18 @@ embedding: input_type: query ``` -**Example for Sentence-Transformers (Jina):** +**Example for Sentence-Transformers (Harrier):** ```yaml embedding: provider: sentence-transformers - model: jinaai/jina-embeddings-v5-text-nano + model: microsoft/harrier-oss-v1-270m + # Most encoder-only models don't require explicit prompts, + # but some (like Nomic or BGE) do: indexing_params: - prompt_name: retrieval.passage + prompt_name: null query_params: - prompt_name: retrieval.query + prompt_name: null ``` --- diff --git a/scripts/MTEB-RANKINGS.md b/scripts/MTEB-RANKINGS.md new file mode 100644 index 0000000..f4d12c4 --- /dev/null +++ b/scripts/MTEB-RANKINGS.md @@ -0,0 +1,55 @@ +# MTEB Model Discovery Report + +> **Data Freshness**: MTEB results dataset last updated on `2026-06-15`. + +## Top Embedding Models for Code Search + +### Tier: Micro (< 50M) + +| Model | Code Search Score | General Retrieval Score | Params (M) | +|-------------------------------------------|---------------------|---------------------------|--------------| +| lightonai/LateOn-Code-edge | 0.816549 | nan | 17 | +| lightonai/LateOn-Code-edge-pretrain | 0.791693 | nan | 16.798 | +| thenlper/gte-small | 0.781565 | 0.479423 | 33 | +| avsolatorio/GIST-small-Embedding-v0 | 0.772521 | 0.480646 | 33.36 | +| avsolatorio/NoInstruct-small-Embedding-v0 | 0.770071 | 0.488884 | 33.36 | + +### Tier: Small (< 150M) + +| Model | Code Search Score | General Retrieval Score | Params (M) | +|---------------------------------------------------|---------------------|---------------------------|--------------| +| lightonai/LateOn-Code | 0.851318 | nan | 149 | +| lightonai/LateOn-Code-pretrain | 0.832574 | nan | 149.016 | +| ibm-granite/granite-embedding-97m-multilingual-r2 | 0.799971 | 0.446515 | 97 | +| avsolatorio/GIST-Embedding-v0 | 0.78981 | 0.503411 | 109.482 | +| thenlper/gte-base | 0.789403 | 0.496155 | 109 | + +### Tier: Medium (< 500M) 
+ +| Model | Code Search Score | General Retrieval Score | Params (M) | +|-------------------------------------------|---------------------|---------------------------|--------------| +| geevec-ai/geevec-embeddings-1.0-lite | 0.92365 | 0.53474 | 366 | +| jinaai/jina-embeddings-v5-text-nano | 0.90384 | 0.535934 | 239 | +| microsoft/harrier-oss-v1-270m | 0.89605 | 0.425505 | 270 | +| Shuu12121/CodeSearch-ModernBERT-Crow-Plus | 0.892957 | nan | 151.668 | +| codefuse-ai/F2LLM-v2-330M | 0.842182 | 0.475202 | 334 | + +### Tier: Large (> 500M) + +| Model | Code Search Score | General Retrieval Score | Params (M) | +|------------------------------------------|---------------------|---------------------------|--------------| +| voyageai/voyage-4-large | 0.97726 | nan | nan | +| voyageai/voyage-4-large (embed_dim=2048) | 0.97719 | nan | nan | +| google/gemini-embedding-2-preview | 0.972905 | nan | nan | +| microsoft/harrier-oss-v1-27b | 0.96994 | 0.483455 | 27009.3 | +| Octen/Octen-Embedding-8B-INT8 | 0.967965 | nan | 7567.3 | + +--- + +## How to Regenerate this Report + +This report was generated using the `find_best_models.py` script. 
To update it with the latest live data from MTEB, run: + +```bash +uv run scripts/find_best_models.py --clear-cache --output MTEB-RANKINGS.md +``` diff --git a/scripts/find_best_models.py b/scripts/find_best_models.py index 8b89fbb..ef425f0 100644 --- a/scripts/find_best_models.py +++ b/scripts/find_best_models.py @@ -163,6 +163,12 @@ def main(): "jinaai/jina-embeddings-v5-text-nano": 239, "ibm-granite/granite-embedding-97m-multilingual-r2": 97, "geevec-ai/geevec-embeddings-1.0-lite": 366, + "lightonai/LateOn-Code-edge": 17, + "lightonai/LateOn-Code": 149, + "microsoft/harrier-oss-v1-270m": 270, + "thenlper/gte-small": 33, + "thenlper/gte-base": 109, + "codefuse-ai/F2LLM-v2-330M": 334, } def categorize(size): @@ -178,7 +184,7 @@ def categorize(size): print("Analyzing top candidates to determine hardware tiers...", file=sys.stderr) results["max_score"] = results[["score_general", "score_code"]].max(axis=1) - results = results.sort_values(by="max_score", ascending=False).head(500) + results = results.sort_values(by="max_score", ascending=False).head(1000) results["size_mb"] = results["model_name"].map(known_sizes)