feat: support external embedding service for scaling (#248)

kyteinsky · web-flow · commit 6ad9d27b955a · 2025-12-12T13:28:43.000+05:30
tested with (using both config.yaml and env vars):
- internal main_em.py service
- external local embedding service
- remotely hosted embedding service (IONOS)

---------

Signed-off-by: Anupam Kumar &lt;kyteinsky@gmail.com&gt;
diff --git a/README.md b/README.md
@@ -116,6 +116,13 @@ Make sure to restart the app after changing the config file. For docker, this wo
 
 This is a file copied from one of the two configurations (config.cpu.yaml or config.gpu.conf) during app startup if `config.yaml` is not already present to the persistent storage. See [Repair section](#repair) on details on the repair step that removes the config if you have a custom config.
 
+The default is to spawn an embedding server backed by llama.cpp, where the local model runs on either CPU or GPU. The other option is to use a remote model served through an OpenAI-compatible API. The configuration for the remote model is also present in the sample config files.
+The API key or username/password for the remote API can be stored in the config file itself, or environment variables can be used: `CC_EM_APIKEY` for the API key, and `CC_EM_USERNAME` and `CC_EM_PASSWORD` for the username and password respectively.
+To indicate the use of environment variables, set the value of `auth` in the config file to `from_env`, like so:
+```yaml
+auth: from_env
+```
+
 ## Repair
 v2.1.0 introduces repair steps. These run on app startup.
 
diff --git a/appinfo/info.xml b/appinfo/info.xml
@@ -56,6 +56,31 @@ Setup background job workers as described here: https://docs.nextcloud.com/serve
 				<display-name>Auto-download models from Huggingface</display-name>
 				<description>When set to "false", "0" or "no", initial download of the Huggingface models will be skipped in the init phase. They would have to be downloaded and placed in the persistent storage manually or through a mount point.</description>
 			</variable>
+			<variable>
+				<name>CC_EM_BASE_URL</name>
+				<display-name>External OpenAI-compatible endpoint</display-name>
+				<description>Set this to an OpenAI-compatible endpoint like https://api.my-local-llm.lan/v1. When set, the internal embedding server is not started. For authentication, set CC_EM_APIKEY or CC_EM_USERNAME and CC_EM_PASSWORD as needed.</description>
+			</variable>
+			<variable>
+				<name>CC_EM_MODEL_NAME</name>
+				<display-name>External embedding model name</display-name>
+				<description>Model name to be used with the OpenAI-compatible endpoint set in CC_EM_BASE_URL. For example, "text-embedding-3-small" or any other model supported by the endpoint. If unset, no model name is sent in the requests.</description>
+			</variable>
+			<variable>
+				<name>CC_EM_APIKEY</name>
+				<display-name>API key for authentication to CC_EM_BASE_URL</display-name>
+				<description>API key to be used for authenticating requests to the OpenAI-compatible endpoint set in CC_EM_BASE_URL. Either this or CC_EM_USERNAME and CC_EM_PASSWORD should be set if the endpoint requires authentication.</description>
+			</variable>
+			<variable>
+				<name>CC_EM_USERNAME</name>
+				<display-name>Username for authentication to CC_EM_BASE_URL</display-name>
+				<description>Username to be used for authenticating requests to the OpenAI-compatible endpoint set in CC_EM_BASE_URL.</description>
+			</variable>
+			<variable>
+				<name>CC_EM_PASSWORD</name>
+				<display-name>Password for authentication to CC_EM_BASE_URL</display-name>
+				<description>Password to be used for authenticating requests to the OpenAI-compatible endpoint set in CC_EM_BASE_URL.</description>
+			</variable>
 		</environment-variables>
 	</external-app>
 </info>
diff --git a/config.cpu.yaml b/config.cpu.yaml
@@ -16,12 +16,21 @@ vectordb:
     # 'connection' overrides the env var 'CCB_DB_URL'
 
 embedding:
-  protocol: http
-  host: localhost
-  port: 5000
+  # embedding service config
+  # for an external embedding service, set the CC_EM_BASE_URL env var (plus CC_EM_APIKEY, or CC_EM_USERNAME and CC_EM_PASSWORD, if it needs auth) during deployment
+  # if CC_EM_BASE_URL is set, the rest of this section is ignored
+  # except request_timeout, which is always respected, even for the remote service
+  base_url: http://localhost:5000/v1
   workers: 1
-  offload_after_mins: 15 # in minutes
   request_timeout: 1800 # in seconds
+  # only for external embedding service
+  # remote_service: true
+  # model_name: text-embedding-3-small
+  # auth:
+  #   apikey: your_api_key_here
+  #   # -or-
+  #   username: your_username_here
+  #   password: your_password_here
   llama:
     # all options: https://python.langchain.com/api_reference/community/embeddings/langchain_community.embeddings.llamacpp.LlamaCppEmbeddings.html
     # 'model_alias' is reserved
@@ -30,6 +39,7 @@ embedding:
     n_batch: 16
     n_ctx: 8192
 
+
 llm:
   nc_texttotext:
 
diff --git a/config.gpu.yaml b/config.gpu.yaml
@@ -16,12 +16,21 @@ vectordb:
     # 'connection' overrides the env var 'CCB_DB_URL'
 
 embedding:
-  protocol: http
-  host: localhost
-  port: 5000
+  # embedding service config
+  # for an external embedding service, set the CC_EM_BASE_URL env var (plus CC_EM_APIKEY, or CC_EM_USERNAME and CC_EM_PASSWORD, if it needs auth) during deployment
+  # if CC_EM_BASE_URL is set, the rest of this section is ignored
+  # except request_timeout, which is always respected, even for the remote service
+  base_url: http://localhost:5000/v1
   workers: 1
-  offload_after_mins: 15 # in minutes
   request_timeout: 1800 # in seconds
+  # only for external embedding service
+  # remote_service: true
+  # model_name: text-embedding-3-small
+  # auth:
+  #   apikey: your_api_key_here
+  #   # -or-
+  #   username: your_username_here
+  #   password: your_password_here
   llama:
     # all options: https://python.langchain.com/api_reference/community/embeddings/langchain_community.embeddings.llamacpp.LlamaCppEmbeddings.html
     # 'model_alias' is reserved
@@ -31,6 +40,7 @@ embedding:
     n_ctx: 8192
     n_gpu_layers: -1
 
+
 llm:
   nc_texttotext:
 
diff --git a/context_chat_backend/config_parser.py b/context_chat_backend/config_parser.py
@@ -2,10 +2,13 @@
 # SPDX-FileCopyrightText: 2024 Nextcloud GmbH and Nextcloud contributors
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
+import os
+
 from ruamel.yaml import YAML
 
 from .models.loader import models
-from .types import TConfig
+from .types import TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig
+from .utils import value_of
 from .vectordb.loader import vector_dbs
 
 
@@ -47,6 +50,58 @@ def get_config(file_path: str) -> TConfig:
 			f'Error: llm model should be at least one of {models["llm"]} in the config file'
 		)
 
+	# convert protocol, host and port to base_url
+	embedding = config.get('embedding')
+	if (embedding is None or not isinstance(embedding, dict)) and not os.getenv('CC_EM_BASE_URL'):
+		raise AssertionError(
+			'Error: "embedding" key should be defined in the config file or CC_EM_BASE_URL env var should be set in the'
+			' Deploy Options.'
+		)
+
+	if os.getenv('CC_EM_BASE_URL'):
+		if os.getenv('CC_EM_APIKEY'):
+			auth = TEmbeddingAuthApiKey(apikey=os.environ['CC_EM_APIKEY'])
+		elif os.getenv('CC_EM_USERNAME') and os.getenv('CC_EM_PASSWORD'):
+			auth = TEmbeddingAuthBasic(
+				username=os.environ['CC_EM_USERNAME'],
+				password=os.environ['CC_EM_PASSWORD'],
+			)
+		else:
+			auth = None
+
+		try:
+			# override embedding config from env vars
+			embedding_config = TEmbeddingConfig(
+				base_url=os.environ['CC_EM_BASE_URL'],
+				model_name=value_of(os.getenv('CC_EM_MODEL_NAME', None)),
+				auth=auth,
+				remote_service=True,
+				workers=0,
+				request_timeout=embedding.get('request_timeout', 1800) if embedding else 1800,
+			)
+		except Exception as e:
+			raise AssertionError(
+				'Error: could not create embedding config from env vars'
+			) from e
+
+	elif embedding is None:
+		raise AssertionError(
+			'Error: "embedding" key should be defined in the config file if CC_EM_BASE_URL env var is not set in the'
+			' Deploy Options.'
+		)
+	else:
+		# embedding from config file
+		if 'protocol' in embedding and 'host' in embedding and 'port' in embedding:
+			embedding['base_url'] = f"{embedding['protocol']}://{embedding['host']}:{embedding['port']}/v1"
+			del embedding['protocol']
+			del embedding['host']
+			del embedding['port']
+
+		try:
+			embedding_config = TEmbeddingConfig(**embedding)
+		except Exception as e:
+			raise AssertionError('Error: could not create embedding config from config file') from e
+
 	return TConfig(
 		debug=config.get('debug', False),
 		uvicorn_log_level=config.get('uvicorn_log_level', 'info'),
@@ -58,6 +113,6 @@ def get_config(file_path: str) -> TConfig:
 		doc_parser_worker_limit=config.get('doc_parser_worker_limit', 10),
 
 		vectordb=vectordb,
-		embedding=config.get('embedding', {}), # for a more appropriate response
+		embedding=embedding_config,
 		llm=llm,
 	)
diff --git a/context_chat_backend/network_em.py b/context_chat_backend/network_em.py
@@ -3,14 +3,22 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 #
 import logging
+from collections.abc import Generator
 from time import sleep
 from typing import Literal, TypedDict
 
 import httpx
 from langchain_core.embeddings import Embeddings
 from pydantic import BaseModel
 
-from .types import EmbeddingException, RetryableEmbeddingException, TConfig
+from .types import (
+	EmbeddingException,
+	FatalEmbeddingException,
+	RetryableEmbeddingException,
+	TConfig,
+	TEmbeddingAuthApiKey,
+	TEmbeddingAuthBasic,
+)
 
 logger = logging.getLogger('ccb.nextwork_em')
 
@@ -34,6 +42,15 @@ class CreateEmbeddingResponse(TypedDict):
 	usage: EmbeddingUsage
 
 
+class ApiKeyAuth(httpx.Auth):
+	def __init__(self, apikey: str | bytes) -> None:
+		self._apikey = apikey
+
+	def auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, httpx.Response, None]:
+		request.headers['Authorization'] = f'Bearer {self._apikey}'
+		yield request
+
+
 class NetworkEmbeddings(Embeddings, BaseModel):
 	app_config: TConfig
 
@@ -47,14 +64,32 @@ def _get_embedding(self, input_: str | list[str], try_: int = 3) -> list[float]
 		)
 
 		try:
-			with httpx.Client() as client:
+			match emconf.auth:
+				case None:
+					auth = httpx.USE_CLIENT_DEFAULT
+				case TEmbeddingAuthApiKey(apikey=apikey):
+					auth = ApiKeyAuth(apikey=apikey)
+				case TEmbeddingAuthBasic(username=username, password=password):
+					auth = httpx.BasicAuth(username=username, password=password)
+
+			data = {'input': input_}
+			if emconf.model_name:
+				data['model'] = emconf.model_name
+
+			with httpx.Client(verify=self.app_config.httpx_verify_ssl) as client:
 				response = client.post(
-					f'{emconf.protocol}://{emconf.host}:{emconf.port}/v1/embeddings',
-					json={'input': input_},
+					f'{emconf.base_url.removesuffix("/")}/embeddings',
+					json=data,
 					timeout=emconf.request_timeout,
+					auth=auth,
 				)
-				if response.status_code != 200:
+				if response.status_code // 100 == 4:
+					raise FatalEmbeddingException(response.text)
+				if response.status_code // 100 != 2:
 					raise EmbeddingException(response.text)
+		except FatalEmbeddingException as e:
+			logger.error('Fatal error while getting embeddings: %s', str(e), exc_info=e)
+			raise e
 		except (
 			EmbeddingException,
 			httpx.RemoteProtocolError,
diff --git a/context_chat_backend/types.py b/context_chat_backend/types.py
@@ -5,20 +5,33 @@
 from pydantic import BaseModel
 
 __all__ = [
+	'DEFAULT_EM_MODEL_ALIAS',
 	'EmbeddingException',
 	'LoaderException',
 	'TConfig',
-	'TEmbedding',
+	'TEmbeddingAuthApiKey',
+	'TEmbeddingAuthBasic',
+	'TEmbeddingConfig',
 ]
 
-class TEmbedding(BaseModel):
-	protocol: str
-	host: str
-	port: int
-	workers: int
-	offload_after_mins: int
-	request_timeout: int
-	llama: dict
+DEFAULT_EM_MODEL_ALIAS = 'em_model'
+
+
+class TEmbeddingAuthApiKey(BaseModel):  # API-key credentials for the embedding service
+	apikey: str
+
+class TEmbeddingAuthBasic(BaseModel):  # username/password credentials for the embedding service
+	username: str
+	password: str
+
+class TEmbeddingConfig(BaseModel):  # settings of the embedding service (local server or remote API)
+	base_url: str = 'http://localhost:5000/v1'  # '/embeddings' is appended to this when requesting embeddings
+	workers: int = 1  # set to 0 when the config is built from the CC_EM_* env vars
+	request_timeout: int = 1750  # NOTE(review): sample configs and the env-var path use 1800 — confirm intended
+	model_name: str | None = DEFAULT_EM_MODEL_ALIAS  # sent as 'model' in the request body when truthy
+	auth: TEmbeddingAuthApiKey | TEmbeddingAuthBasic | None = None  # None: the httpx client default is used
+	remote_service: bool = False  # set to True when the config is built from the CC_EM_* env vars
+	llama: dict = dict()  # noqa: C408
 
 
 class TConfig(BaseModel):
@@ -32,7 +45,7 @@ class TConfig(BaseModel):
 	doc_parser_worker_limit: int
 
 	vectordb: tuple[str, dict]
-	embedding: TEmbedding
+	embedding: TEmbeddingConfig
 	llm: tuple[str, dict]
 
 
@@ -50,3 +63,10 @@ class RetryableEmbeddingException(EmbeddingException):
 	This keeps the indexing loop running and adds to the retry list.
 	The parent exception would break the loop and stop the indexing process.
 	"""
+
+class FatalEmbeddingException(EmbeddingException):
+	"""
+	Exception that indicates a fatal error in the embedding request.
+
+	Raised for HTTP 4xx responses from the embedding service, e.g. a malformed request or an authentication error.
+	"""
diff --git a/context_chat_backend/utils.py b/context_chat_backend/utils.py
@@ -14,6 +14,8 @@
 
 from fastapi.responses import JSONResponse as FastAPIJSONResponse
 
+from .types import TConfig, TEmbeddingAuthApiKey, TEmbeddingAuthBasic, TEmbeddingConfig
+
 T = TypeVar('T')
 _logger = logging.getLogger('ccb.utils')
 
@@ -121,3 +123,22 @@ def wrapper(*args, **kwargs):
 		return res
 
 	return wrapper
+
+
+def redact_config(config: TConfig | TEmbeddingConfig) -> TConfig | TEmbeddingConfig:
+	'''
+	Redact sensitive information from the config for logging
+	'''
+	if isinstance(config, TConfig):
+		em_conf = config.embedding
+	else:
+		em_conf = config
+
+	if em_conf.auth:
+		if isinstance(em_conf.auth, TEmbeddingAuthApiKey):
+			em_conf.auth.apikey = '***REDACTED***'
+		elif isinstance(em_conf.auth, TEmbeddingAuthBasic):
+			em_conf.auth.username = '***REDACTED***'
+			em_conf.auth.password = '***REDACTED***'  # noqa: S105
+
+	return config
diff --git a/main.py b/main.py
@@ -12,6 +12,7 @@
 from context_chat_backend.types import TConfig  # isort: skip
 from context_chat_backend.controller import app  # isort: skip
 from context_chat_backend.logger import get_logging_config, setup_logging  # isort: skip
+from context_chat_backend.utils import redact_config  # isort: skip
 
 LOGGER_CONFIG_NAME = 'logger_config.yaml'
 
@@ -47,7 +48,7 @@ def _setup_log_levels(debug: bool):
 	app_config: TConfig = app.extra['CONFIG']
 	_setup_log_levels(app_config.debug)
 
-	print('App config:\n' + app_config.model_dump_json(indent=2), flush=True)
+	print('App config:\n' + redact_config(app_config).model_dump_json(indent=2), flush=True)
 
 	uv_log_config = uvicorn.config.LOGGING_CONFIG  # pyright: ignore[reportAttributeAccessIssue]
 	uv_log_config['formatters']['json'] = logging_config['formatters']['json']
diff --git a/main_em.py b/main_em.py