diff --git a/README.md b/README.md index 4b95782..edcdea7 100644 --- a/README.md +++ b/README.md @@ -397,7 +397,7 @@ embedding: # `ccc init` auto-populates these for known models (e.g. Cohere, Voyage, Nvidia NIM, # nomic-ai code-retrieval models, Snowflake arctic-embed). # indexing_params: - # input_type: search_document # litellm: input_type, dimensions + # input_type: search_document # litellm: input_type # query_params: # input_type: search_query # sentence-transformers: prompt_name @@ -427,7 +427,7 @@ embedding: OpenAI embeddings (`text-embedding-3-*`, `text-embedding-ada-002`) are intentionally not in the list: they're symmetric and have no equivalent knob. -**Accepted keys:** `prompt_name` (sentence-transformers), `input_type` and `dimensions` (litellm). Other keys are rejected at daemon startup with a clear error. +**Accepted keys:** `prompt_name` (sentence-transformers) and `input_type` (litellm). Other keys are rejected at daemon startup with a clear error. Note: `dimensions` is intentionally not exposed here — output dimension must be identical for indexing and query, so it's a model-wide setting rather than a per-side knob. **Doctor checks both sides.** `ccc doctor` exercises the model once with `indexing_params` and once with `query_params`, reporting each as a separate `Model Check (indexing)` / `Model Check (query)` entry — so a misconfiguration on one side is diagnosable without hiding behind the other. diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index 36614ed..41334bc 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -586,7 +586,7 @@ def run_daemon() -> None: handshake_warnings.append( _build_backward_compat_warning(user_settings, user_settings_path()) ) - embedder = create_embedder(user_settings.embedding) + embedder = create_embedder(user_settings.embedding, indexing_params=indexing_params) else: settings_env_keys = [] embedder = None diff --git a/src/cocoindex_code/embedder_params.py b/src/cocoindex_code/embedder_params.py index 1bbe1ba..01efaf9 100644 --- a/src/cocoindex_code/embedder_params.py +++ b/src/cocoindex_code/embedder_params.py @@ -21,13 +21,14 @@ # Accepted kwargs per provider. Intentionally minimal — we only expose knobs -# that users have reason to tune. ``normalize_embeddings`` (sentence- -# transformers) and ``encoding_format`` (litellm) are deliberately excluded -# because other code assumes unit vectors (query._l2_to_score) and float -# payloads (litellm_embedder hardcodes encoding_format="float"). +# that users have reason to tune AND that make sense per-side (indexing vs +# query). Excluded keys: +# - ``normalize_embeddings`` (sentence-transformers): query._l2_to_score +# assumes unit vectors. +# - ``encoding_format`` (litellm): litellm_embedder hardcodes "float". _ACCEPTED_KWARGS: dict[str, frozenset[str]] = { "sentence-transformers": frozenset({"prompt_name"}), - "litellm": frozenset({"input_type", "dimensions"}), + "litellm": frozenset({"input_type"}), } diff --git a/src/cocoindex_code/settings.py b/src/cocoindex_code/settings.py index 1aafd07..afee984 100644 --- a/src/cocoindex_code/settings.py +++ b/src/cocoindex_code/settings.py @@ -544,7 +544,7 @@ def save_user_settings(settings: UserSettings) -> Path: "litellm": ( " #\n" " # Extra kwargs passed to the embedder. Supported keys:\n" - " # input_type, dimensions\n" + " # input_type\n" " # indexing_params: {}\n" " # query_params: {}\n" ), diff --git a/src/cocoindex_code/shared.py b/src/cocoindex_code/shared.py index 1fd7a50..b42e722 100644 --- a/src/cocoindex_code/shared.py +++ b/src/cocoindex_code/shared.py @@ -76,8 +76,26 @@ async def check_embedding( return EmbeddingCheckResult(dim=None, error=msg) -def create_embedder(settings: EmbeddingSettings) -> Embedder: - """Create and return an embedder instance based on settings.""" +def create_embedder( + settings: EmbeddingSettings, + indexing_params: dict[str, Any] | None = None, +) -> Embedder: + """Create and return an embedder instance based on settings. + + For LiteLLM embedders, *indexing_params* (e.g. ``{"input_type": "passage"}``) + are passed to the constructor as default kwargs forwarded into every + ``litellm.aembedding`` call — including paths that don't go through + :data:`INDEXING_EMBED_PARAMS` (e.g. the dimension probe in ``_get_dim``, + or any helper that calls ``embed()`` with no per-side kwargs). Per-call + overrides (the ``query_params`` spread at query time) still take effect + because :meth:`LiteLLMEmbedder._embed` overlays kwargs on top of the + constructor's ``self._kwargs``. + + *indexing_params* is ignored for sentence-transformers — its constructor + doesn't accept arbitrary kwargs; ``prompt_name`` is a per-call argument + only and the indexing default is supplied at the call site via + :data:`INDEXING_EMBED_PARAMS`. + """ if settings.provider == "sentence-transformers": from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder @@ -103,6 +121,7 @@ def create_embedder(settings: EmbeddingSettings) -> Embedder: instance = PacedLiteLLMEmbedder( settings.model, min_interval_ms=min_interval_ms, + **(dict(indexing_params) if indexing_params else {}), ) logger.info( "Embedding model (LiteLLM): %s | min_interval_ms: %s", diff --git a/tests/test_embedder_params.py b/tests/test_embedder_params.py index b7a73bb..acd6f1d 100644 --- a/tests/test_embedder_params.py +++ b/tests/test_embedder_params.py @@ -14,9 +14,13 @@ def test_validate_params_accepts_known_keys() -> None: validate_params("sentence-transformers", {}, {"prompt_name": "query"}) - validate_params( - "litellm", {"input_type": "passage"}, {"input_type": "query", "dimensions": 512} - ) + validate_params("litellm", {"input_type": "passage"}, {"input_type": "query"}) + + +def test_validate_params_rejects_dimensions() -> None: + """`dimensions` is a model-wide setting, not a per-side knob — must be rejected.""" + with pytest.raises(ValueError, match="dimensions"): + validate_params("litellm", {"dimensions": 512}, {}) def test_validate_params_rejects_unknown_key() -> None: diff --git a/tests/test_settings.py b/tests/test_settings.py index fe3a415..6c06af1 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -620,4 +620,6 @@ def test_save_initial_writes_comment_template_for_unknown_litellm() -> None: assert "# indexing_params: {}" in content assert "# query_params: {}" in content assert "input_type" in content - assert "dimensions" in content + # `dimensions` is intentionally NOT in the litellm template — it must be + # the same on both sides, so we don't expose it as a per-side knob. + assert "dimensions" not in content diff --git a/tests/test_shared.py b/tests/test_shared.py index f0e3ef1..500e9ce 100644 --- a/tests/test_shared.py +++ b/tests/test_shared.py @@ -39,6 +39,33 @@ def test_create_embedder_uses_paced_litellm_embedder() -> None: assert embedder._min_request_interval_seconds == 0.3 +def test_create_embedder_litellm_passes_indexing_params_as_constructor_default() -> None: + """Indexing params become default kwargs forwarded into every litellm call — + covering paths that don't go through INDEXING_EMBED_PARAMS (dim probe, etc.). + """ + embedder = create_embedder( + EmbeddingSettings(provider="litellm", model="cohere/embed-english-v3.0"), + indexing_params={"input_type": "search_document"}, + ) + assert isinstance(embedder, PacedLiteLLMEmbedder) + assert embedder._kwargs == {"input_type": "search_document"} + + +def test_create_embedder_sentence_transformers_ignores_indexing_params() -> None: + """The SentenceTransformer constructor doesn't accept arbitrary kwargs; + indexing_params is silently ignored for that provider. + """ + embedder = create_embedder( + EmbeddingSettings( + provider="sentence-transformers", model="sentence-transformers/all-MiniLM-L6-v2" + ), + indexing_params={"prompt_name": "passage"}, + ) + # No exception, and prompt_name is not stashed on the constructor — + # it's a per-call argument supplied via the embed() call site. + assert not isinstance(embedder, PacedLiteLLMEmbedder) + + def test_is_sentence_transformers_installed_true_in_dev() -> None: # Dev env pulls in sentence-transformers via the `dev` extras group. assert is_sentence_transformers_installed() is True