
Commit dd16683

Mijamind719codex committed

feat: add local llama-cpp embedding support

Co-authored-by: GPT-5.4 <noreply@openai.com>

1 parent c6e8de9 · commit dd16683

14 files changed: 908 additions & 24 deletions

openviking/models/embedder/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -33,6 +33,7 @@
 except ImportError:
     GeminiDenseEmbedder = None  # google-genai not installed
 from openviking.models.embedder.jina_embedders import JinaDenseEmbedder
+from openviking.models.embedder.local_embedders import LocalDenseEmbedder

 try:
     from openviking.models.embedder.litellm_embedders import LiteLLMDenseEmbedder
@@ -66,6 +67,7 @@
     "GeminiDenseEmbedder",
     # Jina AI implementations
     "JinaDenseEmbedder",
+    "LocalDenseEmbedder",
     # LiteLLM implementations
     "LiteLLMDenseEmbedder",
     # MiniMax implementations
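
With the re-export in place, the local embedder is importable alongside the remote implementations. A minimal check (the import itself does not load a model; llama-cpp-python is only needed once an instance is constructed):

    # Newly exported name, listed in __all__ next to the Jina embedder.
    from openviking.models.embedder import LocalDenseEmbedder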

openviking/models/embedder/base.py

Lines changed: 28 additions & 0 deletions

@@ -157,6 +157,22 @@ def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]:
         """
         return [self.embed(text, is_query=is_query) for text in texts]

+    def embed_query(self, text: str) -> EmbedResult:
+        """Embed query text with explicit retrieval-side semantics."""
+        return self.embed(text, is_query=True)
+
+    def embed_document(self, text: str) -> EmbedResult:
+        """Embed document text with explicit indexing-side semantics."""
+        return self.embed(text, is_query=False)
+
+    def embed_batch_query(self, texts: List[str]) -> List[EmbedResult]:
+        """Batch embed query texts."""
+        return self.embed_batch(texts, is_query=True)
+
+    def embed_batch_document(self, texts: List[str]) -> List[EmbedResult]:
+        """Batch embed document texts."""
+        return self.embed_batch(texts, is_query=False)
+
     async def embed_async(self, text: str, is_query: bool = False) -> EmbedResult:
         """Async embed single text.
@@ -175,6 +191,18 @@ async def embed_batch_async(
             results.append(await self.embed_async(text, is_query=is_query))
         return results

+    async def embed_query_async(self, text: str) -> EmbedResult:
+        return await self.embed_async(text, is_query=True)
+
+    async def embed_document_async(self, text: str) -> EmbedResult:
+        return await self.embed_async(text, is_query=False)
+
+    async def embed_batch_query_async(self, texts: List[str]) -> List[EmbedResult]:
+        return await self.embed_batch_async(texts, is_query=True)
+
+    async def embed_batch_document_async(self, texts: List[str]) -> List[EmbedResult]:
+        return await self.embed_batch_async(texts, is_query=False)
+
     def close(self):
         """Release resources, subclasses can override as needed"""
         pass
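
These wrappers only pin the is_query flag, so every concrete DenseEmbedderBase subclass inherits them unchanged. A minimal sketch of the call pattern, using the LocalDenseEmbedder added later in this commit (any subclass would do; the sample texts are illustrative):

    from openviking.models.embedder import LocalDenseEmbedder

    embedder = LocalDenseEmbedder()
    q = embedder.embed_query("what is a GGUF file?")              # == embed(text, is_query=True)
    docs = embedder.embed_batch_document(["doc one", "doc two"])  # == embed_batch(texts, is_query=False)
    print(len(q.dense_vector), len(docs))
    embedder.close()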
openviking/models/embedder/local_embedders.py (new file)

Lines changed: 269 additions & 0 deletions

# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: AGPL-3.0
"""Local GGUF embedders powered by llama-cpp-python."""

from __future__ import annotations

import importlib
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests

from openviking.models.embedder.base import DenseEmbedderBase, EmbedResult
from openviking.storage.errors import EmbeddingConfigurationError

logger = logging.getLogger(__name__)

DEFAULT_LOCAL_MODEL_CACHE_DIR = "~/.cache/openviking/models"
DEFAULT_LOCAL_DENSE_MODEL = "bge-small-zh-v1.5-f16"
# BGE zh query prefix: "Generate a representation for this sentence to retrieve related articles:"
DEFAULT_BGE_ZH_QUERY_INSTRUCTION = "为这个句子生成表示以用于检索相关文章:"


@dataclass(frozen=True)
class LocalModelSpec:
    model_name: str
    dimension: int
    filename: str
    download_url: str
    query_instruction: Optional[str] = None


LOCAL_DENSE_MODEL_SPECS: Dict[str, LocalModelSpec] = {
    DEFAULT_LOCAL_DENSE_MODEL: LocalModelSpec(
        model_name=DEFAULT_LOCAL_DENSE_MODEL,
        dimension=512,
        filename="bge-small-zh-v1.5-f16.gguf",
        download_url=(
            "https://huggingface.co/CompendiumLabs/bge-small-zh-v1.5-gguf/resolve/main/"
            "bge-small-zh-v1.5-f16.gguf?download=true"
        ),
        query_instruction=DEFAULT_BGE_ZH_QUERY_INSTRUCTION,
    )
}


def get_local_model_spec(model_name: str) -> LocalModelSpec:
    try:
        return LOCAL_DENSE_MODEL_SPECS[model_name]
    except KeyError as exc:
        raise ValueError(
            f"Unknown local embedding model '{model_name}'. "
            f"Supported models: {list(LOCAL_DENSE_MODEL_SPECS.keys())}"
        ) from exc


def get_local_model_default_dimension(model_name: str) -> int:
    return get_local_model_spec(model_name).dimension


def get_local_model_cache_path(model_name: str, cache_dir: Optional[str] = None) -> Path:
    spec = get_local_model_spec(model_name)
    cache_root = Path(cache_dir or DEFAULT_LOCAL_MODEL_CACHE_DIR).expanduser().resolve()
    return cache_root / spec.filename


def get_local_model_identity(model_name: str, model_path: Optional[str] = None) -> str:
    if model_path:
        resolved = Path(model_path).expanduser().resolve()
        return str(resolved)
    return get_local_model_spec(model_name).filename


class LocalDenseEmbedder(DenseEmbedderBase):
    """Dense embedder backed by a local GGUF model via llama-cpp-python."""

    def __init__(
        self,
        model_name: str = DEFAULT_LOCAL_DENSE_MODEL,
        model_path: Optional[str] = None,
        cache_dir: Optional[str] = None,
        dimension: Optional[int] = None,
        query_instruction: Optional[str] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
        runtime_config = dict(config or {})
        runtime_config.setdefault("provider", "local")
        super().__init__(model_name, runtime_config)

        self.model_spec = get_local_model_spec(model_name)
        self.model_path = model_path
        self.cache_dir = cache_dir or DEFAULT_LOCAL_MODEL_CACHE_DIR
        self.query_instruction = (
            query_instruction
            if query_instruction is not None
            else self.model_spec.query_instruction
        )
        self._dimension = dimension or self.model_spec.dimension
        if self._dimension != self.model_spec.dimension:
            raise ValueError(
                f"Local model '{model_name}' has fixed dimension {self.model_spec.dimension}, "
                f"but got dimension={self._dimension}"
            )

        self._resolved_model_path = self._resolve_model_path()
        self._llama = self._load_model()

    def _import_llama(self):
        try:
            module = importlib.import_module("llama_cpp")
        except ImportError as exc:
            raise EmbeddingConfigurationError(
                "Local embedding is enabled but 'llama-cpp-python' is not installed. "
                'Install it with: pip install "openviking[local-embed]". '
                "If you prefer a remote provider, set embedding.dense.provider explicitly in ov.conf."
            ) from exc

        llama_cls = getattr(module, "Llama", None)
        if llama_cls is None:
            raise EmbeddingConfigurationError(
                "llama_cpp.Llama is unavailable in the installed llama-cpp-python package."
            )
        return llama_cls

    def _resolve_model_path(self) -> Path:
        if self.model_path:
            resolved = Path(self.model_path).expanduser().resolve()
            if not resolved.exists():
                raise EmbeddingConfigurationError(
                    f"Local embedding model file not found: {resolved}"
                )
            return resolved

        cache_root = Path(self.cache_dir).expanduser().resolve()
        cache_root.mkdir(parents=True, exist_ok=True)
        target = get_local_model_cache_path(self.model_name, self.cache_dir)
        if target.exists():
            return target

        self._download_model(self.model_spec.download_url, target)
        return target

    def _download_model(self, url: str, target: Path) -> None:
        logger.info("Downloading local embedding model %s to %s", self.model_name, target)
        # Stream to a .part file, then atomically rename, so an interrupted
        # download never leaves a truncated GGUF in the cache.
        tmp_target = target.with_suffix(target.suffix + ".part")
        try:
            with requests.get(url, stream=True, timeout=(10, 300)) as response:
                response.raise_for_status()
                with tmp_target.open("wb") as fh:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            fh.write(chunk)
            os.replace(tmp_target, target)
        except Exception as exc:
            tmp_target.unlink(missing_ok=True)
            raise EmbeddingConfigurationError(
                f"Failed to download local embedding model '{self.model_name}' from {url} "
                f"to {target}: {exc}"
            ) from exc

    def _load_model(self):
        llama_cls = self._import_llama()
        try:
            return llama_cls(
                model_path=str(self._resolved_model_path),
                embedding=True,
                verbose=False,
            )
        except Exception as exc:
            raise EmbeddingConfigurationError(
                f"Failed to load GGUF embedding model from {self._resolved_model_path}: {exc}"
            ) from exc

    def _format_text(self, text: str, *, is_query: bool) -> str:
        if is_query and self.query_instruction:
            return f"{self.query_instruction}{text}"
        return text

    @staticmethod
    def _extract_embedding(payload: Any) -> List[float]:
        if isinstance(payload, dict):
            data = payload.get("data")
            if isinstance(data, list) and data:
                item = data[0]
                if isinstance(item, dict) and "embedding" in item:
                    return list(item["embedding"])
            if "embedding" in payload:
                return list(payload["embedding"])
        raise RuntimeError("Unexpected llama-cpp-python embedding response format")

    @staticmethod
    def _extract_embeddings(payload: Any) -> List[List[float]]:
        if isinstance(payload, dict):
            data = payload.get("data")
            if isinstance(data, list):
                vectors: List[List[float]] = []
                for item in data:
                    if not isinstance(item, dict) or "embedding" not in item:
                        raise RuntimeError(
                            "Unexpected llama-cpp-python batch embedding response format"
                        )
                    vectors.append(list(item["embedding"]))
                return vectors
        raise RuntimeError("Unexpected llama-cpp-python batch embedding response format")

    def embed(self, text: str, is_query: bool = False) -> EmbedResult:
        formatted = self._format_text(text, is_query=is_query)

        def _call() -> EmbedResult:
            payload = self._llama.create_embedding(formatted)
            return EmbedResult(dense_vector=self._extract_embedding(payload))

        try:
            result = self._run_with_retry(
                _call,
                logger=logger,
                operation_name="local embedding",
            )
        except Exception as exc:
            raise RuntimeError(f"Local embedding failed: {exc}") from exc

        estimated_tokens = self._estimate_tokens(formatted)
        self.update_token_usage(
            model_name=self.model_name,
            provider="local",
            prompt_tokens=estimated_tokens,
            completion_tokens=0,
        )
        return result

    def embed_batch(self, texts: List[str], is_query: bool = False) -> List[EmbedResult]:
        if not texts:
            return []

        formatted = [self._format_text(text, is_query=is_query) for text in texts]

        def _call() -> List[EmbedResult]:
            payload = self._llama.create_embedding(formatted)
            return [
                EmbedResult(dense_vector=vector) for vector in self._extract_embeddings(payload)
            ]

        try:
            results = self._run_with_retry(
                _call,
                logger=logger,
                operation_name="local batch embedding",
            )
        except Exception as exc:
            raise RuntimeError(f"Local batch embedding failed: {exc}") from exc

        estimated_tokens = sum(self._estimate_tokens(text) for text in formatted)
        self.update_token_usage(
            model_name=self.model_name,
            provider="local",
            prompt_tokens=estimated_tokens,
            completion_tokens=0,
        )
        return results

    def get_dimension(self) -> int:
        return self._dimension

    def close(self):
        close_fn = getattr(self._llama, "close", None)
        if callable(close_fn):
            close_fn()
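
For a sense of how the pieces fit together, here is a minimal end-to-end sketch. It assumes llama-cpp-python is installed (pip install "openviking[local-embed]") and that the first run can reach Hugging Face to download the default GGUF file; the sample strings and the cosine helper are illustrative only, not part of the commit:

    import math

    from openviking.models.embedder.local_embedders import (
        LocalDenseEmbedder,
        get_local_model_cache_path,
    )

    # First construction resolves (and if needed downloads) the default model,
    # bge-small-zh-v1.5-f16.gguf, into ~/.cache/openviking/models.
    embedder = LocalDenseEmbedder()
    print(get_local_model_cache_path(embedder.model_name))  # cached GGUF path
    print(embedder.get_dimension())  # 512 for the default spec

    # embed_query() prepends the BGE instruction prefix; embed_document() does not.
    query = embedder.embed_query("how do I generate embeddings offline?")
    doc = embedder.embed_document("llama-cpp-python can embed GGUF models locally.")

    def cosine(a, b):
        dot = sum(x * y for x, y in zip(a, b))
        return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))

    print(cosine(query.dense_vector, doc.dense_vector))

    embedder.close()  # releases the underlying llama_cpp.Llama handle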
